{ "id": "2209.15056", "version": "v1", "published": "2022-09-29T18:57:52.000Z", "updated": "2022-09-29T18:57:52.000Z", "title": "Graph Attention Network for Camera Relocalization on Dynamic Scenes", "authors": [ "Mohamed Amine Ouali", "Mohamed Bouguessa", "Riadh Ksantini" ], "categories": [ "cs.CV", "cs.AI", "cs.LG" ], "abstract": "We devise a graph attention network-based approach for learning a scene triangle mesh representation in order to estimate an image camera position in a dynamic environment. Previous approaches built a scene-dependent model that explicitly or implicitly embeds the structure of the scene. They use convolution neural networks or decision trees to establish 2D/3D-3D correspondences. Such a mapping overfits the target scene and does not generalize well to dynamic changes in the environment. Our work introduces a novel approach to solve the camera relocalization problem by using the available triangle mesh. Our 3D-3D matching framework consists of three blocks: (1) a graph neural network to compute the embedding of mesh vertices, (2) a convolution neural network to compute the embedding of grid cells defined on the RGB-D image, and (3) a neural network model to establish the correspondence between the two embeddings. These three components are trained end-to-end. To predict the final pose, we run the RANSAC algorithm to generate camera pose hypotheses, and we refine the prediction using the point-cloud representation. Our approach significantly improves the camera pose accuracy of the state-of-the-art method from $0.358$ to $0.506$ on the RIO10 benchmark for dynamic indoor camera relocalization.", "revisions": [ { "version": "v1", "updated": "2022-09-29T18:57:52.000Z" } ], "analyses": { "keywords": [ "dynamic scenes", "convolution neural network", "generate camera pose hypotheses", "dynamic indoor camera relocalization", "scene triangle mesh representation" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }