{ "id": "2306.08068", "version": "v1", "published": "2023-06-13T18:32:35.000Z", "updated": "2023-06-13T18:32:35.000Z", "title": "DORSal: Diffusion for Object-centric Representations of Scenes $\\textit{et al.}$", "authors": [ "Allan Jabri", "Sjoerd van Steenkiste", "Emiel Hoogeboom", "Mehdi S. M. Sajjadi", "Thomas Kipf" ], "comment": "Project page: https://www.sjoerdvansteenkiste.com/dorsal", "categories": [ "cs.CV", "cs.AI", "cs.LG" ], "abstract": "Recent progress in 3D scene understanding enables scalable learning of representations across large datasets of diverse scenes. As a consequence, generalization to unseen scenes and objects, rendering novel views from just a single or a handful of input images, and controllable scene generation that supports editing, is now possible. However, training jointly on a large number of scenes typically compromises rendering quality when compared to single-scene optimized models such as NeRFs. In this paper, we leverage recent progress in diffusion models to equip 3D scene representation learning models with the ability to render high-fidelity novel views, while retaining benefits such as object-level scene editing to a large degree. In particular, we propose DORSal, which adapts a video diffusion architecture for 3D scene generation conditioned on object-centric slot-based representations of scenes. On both complex synthetic multi-object scenes and on the real-world large-scale Street View dataset, we show that DORSal enables scalable neural rendering of 3D scenes with object-level editing and improves upon existing approaches.", "revisions": [ { "version": "v1", "updated": "2023-06-13T18:32:35.000Z" } ], "analyses": { "keywords": [ "object-centric representations", "scene representation learning models", "3d scene representation learning", "typically compromises rendering quality", "real-world large-scale street view dataset" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }