{ "id": "2204.02547", "version": "v1", "published": "2022-04-06T02:42:33.000Z", "updated": "2022-04-06T02:42:33.000Z", "title": "Modeling Motion with Multi-Modal Features for Text-Based Video Segmentation", "authors": [ "Wangbo Zhao", "Kai Wang", "Xiangxiang Chu", "Fuzhao Xue", "Xinchao Wang", "Yang You" ], "comment": "Accepted to CVPR2022", "categories": [ "cs.CV", "cs.CL" ], "abstract": "Text-based video segmentation aims to segment the target object in a video based on a describing sentence. Incorporating motion information from optical flow maps with appearance and linguistic modalities is crucial yet has been largely ignored by previous work. In this paper, we design a method to fuse and align appearance, motion, and linguistic features to achieve accurate segmentation. Specifically, we propose a multi-modal video transformer, which can fuse and aggregate multi-modal and temporal features between frames. Furthermore, we design a language-guided feature fusion module to progressively fuse appearance and motion features in each feature level with guidance from linguistic features. Finally, a multi-modal alignment loss is proposed to alleviate the semantic gap between features from different modalities. Extensive experiments on A2D Sentences and J-HMDB Sentences verify the performance and the generalization ability of our method compared to the state-of-the-art methods.", "revisions": [ { "version": "v1", "updated": "2022-04-06T02:42:33.000Z" } ], "analyses": { "keywords": [ "multi-modal features", "modeling motion", "linguistic features", "multi-modal alignment loss", "language-guided feature fusion module" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }