{ "id": "2412.06534", "version": "v1", "published": "2024-12-09T14:43:06.000Z", "updated": "2024-12-09T14:43:06.000Z", "title": "Inverting Visual Representations with Detection Transformers", "authors": [ "Jan Rathjens", "Shirin Reyhanian", "David Kappel", "Laurenz Wiskott" ], "categories": [ "cs.CV", "cs.AI", "cs.LG", "cs.NE" ], "abstract": "Understanding the mechanisms underlying deep neural networks in computer vision remains a fundamental challenge. While many prior approaches have focused on visualizing intermediate representations within deep neural networks, particularly convolutional neural networks, these techniques have yet to be thoroughly explored in transformer-based vision models. In this study, we apply the approach of training inverse models to reconstruct input images from intermediate layers within a Detection Transformer, showing that this approach is efficient and feasible for transformer-based vision models. Through qualitative and quantitative evaluations of reconstructed images across model stages, we demonstrate critical properties of Detection Transformers, including contextual shape preservation, inter-layer correlation, and robustness to color perturbations, illustrating how these characteristics emerge within the model's architecture. Our findings contribute to a deeper understanding of transformer-based vision models. The code for reproducing our experiments will be made available at github.com/wiskott-lab/inverse-detection-transformer.", "revisions": [ { "version": "v1", "updated": "2024-12-09T14:43:06.000Z" } ], "analyses": { "keywords": [ "detection transformer", "inverting visual representations", "transformer-based vision models", "deep neural networks", "reconstruct input images" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }