{
  "id": "2401.01970",
  "version": "v1",
  "published": "2024-01-03T20:39:02.000Z",
  "updated": "2024-01-03T20:39:02.000Z",
  "title": "FMGS: Foundation Model Embedded 3D Gaussian Splatting for Holistic 3D Scene Understanding",
  "authors": [
    "Xingxing Zuo",
    "Pouya Samangouei",
    "Yunwen Zhou",
    "Yan Di",
    "Mingyang Li"
  ],
  "comment": "19 pages, Project page coming soon",
  "categories": [
    "cs.CV",
    "cs.AI"
  ],
  "abstract": "Precisely perceiving the geometric and semantic properties of real-world 3D objects is crucial for the continued evolution of augmented reality and robotic applications. To this end, we present \\algfull{} (\\algname{}), which incorporates vision-language embeddings of foundation models into 3D Gaussian Splatting (GS). The key contribution of this work is an efficient method to reconstruct and represent 3D vision-language models. This is achieved by distilling feature maps generated from image-based foundation models into those rendered from our 3D model. To ensure high-quality rendering and fast training, we introduce a novel scene representation by integrating strengths from both GS and multi-resolution hash encodings (MHE). Our effective training procedure also introduces a pixel alignment loss that makes the rendered feature distance of same semantic entities close, following the pixel-level semantic boundaries. Our results demonstrate remarkable multi-view semantic consistency, facilitating diverse downstream tasks, beating state-of-the-art methods by $\\mathbf{10.2}$ percent on open-vocabulary language-based object detection, despite that we are $\\mathbf{851\\times}$ faster for inference. This research explores the intersection of vision, language, and 3D scene representation, paving the way for enhanced scene understanding in uncontrolled real-world environments. We plan to release the code upon paper acceptance.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2024-01-03T20:39:02.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "foundation model embedded 3d gaussian",
      "model embedded 3d gaussian splatting",
      "holistic 3d scene understanding"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 19,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}