{ "id": "2203.16434", "version": "v1", "published": "2022-03-30T16:31:49.000Z", "updated": "2022-03-30T16:31:49.000Z", "title": "TubeDETR: Spatio-Temporal Video Grounding with Transformers", "authors": [ "Antoine Yang", "Antoine Miech", "Josef Sivic", "Ivan Laptev", "Cordelia Schmid" ], "comment": "Accepted at CVPR 2022 (Oral); 17 pages; 8 figures", "categories": [ "cs.CV", "cs.CL", "cs.LG" ], "abstract": "We consider the problem of localizing a spatio-temporal tube in a video corresponding to a given text query. This is a challenging task that requires the joint and efficient modeling of temporal, spatial and multi-modal interactions. To address this task, we propose TubeDETR, a transformer-based architecture inspired by the recent success of such models for text-conditioned object detection. Our model notably includes: (i) an efficient video and text encoder that models spatial multi-modal interactions over sparsely sampled frames and (ii) a space-time decoder that jointly performs spatio-temporal localization. We demonstrate the advantage of our proposed components through an extensive ablation study. We also evaluate our full approach on the spatio-temporal video grounding task and demonstrate improvements over the state of the art on the challenging VidSTG and HC-STVG benchmarks. Code and trained models are publicly available at https://antoyang.github.io/tubedetr.html.", "revisions": [ { "version": "v1", "updated": "2022-03-30T16:31:49.000Z" } ], "analyses": { "keywords": [ "transformers", "models spatial multi-modal interactions", "jointly performs spatio-temporal localization", "spatio-temporal video grounding task", "object detection" ], "note": { "typesetting": "TeX", "pages": 17, "language": "en", "license": "arXiv", "status": "editable" } } }