{
  "id": "2204.02547",
  "version": "v1",
  "published": "2022-04-06T02:42:33.000Z",
  "updated": "2022-04-06T02:42:33.000Z",
  "title": "Modeling Motion with Multi-Modal Features for Text-Based Video Segmentation",
  "authors": [
    "Wangbo Zhao",
    "Kai Wang",
    "Xiangxiang Chu",
    "Fuzhao Xue",
    "Xinchao Wang",
    "Yang You"
  ],
  "comment": "Accepted to CVPR2022",
  "categories": [
    "cs.CV",
    "cs.CL"
  ],
  "abstract": "Text-based video segmentation aims to segment the target object in a video based on a describing sentence. Incorporating motion information from optical flow maps with appearance and linguistic modalities is crucial yet has been largely ignored by previous work. In this paper, we design a method to fuse and align appearance, motion, and linguistic features to achieve accurate segmentation. Specifically, we propose a multi-modal video transformer, which can fuse and aggregate multi-modal and temporal features between frames. Furthermore, we design a language-guided feature fusion module to progressively fuse appearance and motion features in each feature level with guidance from linguistic features. Finally, a multi-modal alignment loss is proposed to alleviate the semantic gap between features from different modalities. Extensive experiments on A2D Sentences and J-HMDB Sentences verify the performance and the generalization ability of our method compared to the state-of-the-art methods.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2022-04-06T02:42:33.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "multi-modal features",
      "modeling motion",
      "linguistic features",
      "multi-modal alignment loss",
      "language-guided feature fusion module"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}