{ "id": "2401.01970", "version": "v1", "published": "2024-01-03T20:39:02.000Z", "updated": "2024-01-03T20:39:02.000Z", "title": "FMGS: Foundation Model Embedded 3D Gaussian Splatting for Holistic 3D Scene Understanding", "authors": [ "Xingxing Zuo", "Pouya Samangouei", "Yunwen Zhou", "Yan Di", "Mingyang Li" ], "comment": "19 pages, Project page coming soon", "categories": [ "cs.CV", "cs.AI" ], "abstract": "Precisely perceiving the geometric and semantic properties of real-world 3D objects is crucial for the continued evolution of augmented reality and robotic applications. To this end, we present Foundation Model Embedded Gaussian Splatting (FMGS), which incorporates vision-language embeddings of foundation models into 3D Gaussian Splatting (GS). The key contribution of this work is an efficient method to reconstruct and represent 3D vision-language models. This is achieved by distilling feature maps generated from image-based foundation models into those rendered from our 3D model. To ensure high-quality rendering and fast training, we introduce a novel scene representation by integrating strengths from both GS and multi-resolution hash encodings (MHE). Our effective training procedure also introduces a pixel alignment loss that makes the rendered feature distance of same semantic entities close, following the pixel-level semantic boundaries. Our results demonstrate remarkable multi-view semantic consistency, facilitating diverse downstream tasks, beating state-of-the-art methods by $\\mathbf{10.2}$ percent on open-vocabulary language-based object detection, despite that we are $\\mathbf{851\\times}$ faster for inference. This research explores the intersection of vision, language, and 3D scene representation, paving the way for enhanced scene understanding in uncontrolled real-world environments. \nWe plan to release the code upon paper acceptance.", "revisions": [ { "version": "v1", "updated": "2024-01-03T20:39:02.000Z" } ], "analyses": { "keywords": [ "foundation model embedded 3d gaussian", "model embedded 3d gaussian splatting", "holistic 3d scene understanding" ], "note": { "typesetting": "TeX", "pages": 19, "language": "en", "license": "arXiv", "status": "editable" } } }