{ "id": "2412.06534", "version": "v1", "published": "2024-12-09T14:43:06.000Z", "updated": "2024-12-09T14:43:06.000Z", "title": "Inverting Visual Representations with Detection Transformers", "authors": [ "Jan Rathjens", "Shirin Reyhanian", "David Kappel", "Laurenz Wiskott" ], "categories": [ "cs.CV", "cs.AI", "cs.LG", "cs.NE" ], "abstract": "Understanding the mechanisms underlying deep neural networks in computer vision remains a fundamental challenge. While many prior approaches have focused on visualizing intermediate representations within deep neural networks, particularly convolutional neural networks, these techniques have yet to be thoroughly explored in transformer-based vision models. In this study, we apply the approach of training inverse models to reconstruct input images from intermediate layers within a Detection Transformer, showing that this approach is efficient and feasible for transformer-based vision models. Through qualitative and quantitative evaluations of reconstructed images across model stages, we demonstrate critical properties of Detection Transformers, including contextual shape preservation, inter-layer correlation, and robustness to color perturbations, illustrating how these characteristics emerge within the model's architecture. Our findings contribute to a deeper understanding of transformer-based vision models. The code for reproducing our experiments will be made available at github.com/wiskott-lab/inverse-detection-transformer.", "revisions": [ { "version": "v1", "updated": "2024-12-09T14:43:06.000Z" } ], "analyses": { "keywords": [ "detection transformer", "inverting visual representations", "transformer-based vision models", "deep neural networks", "reconstruct input images" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }