{ "id": "1902.03604", "version": "v1", "published": "2019-02-10T14:01:22.000Z", "updated": "2019-02-10T14:01:22.000Z", "title": "MOTS: Multi-Object Tracking and Segmentation", "authors": [ "Paul Voigtlaender", "Michael Krause", "Aljosa Osep", "Jonathon Luiten", "Berin Balachandar Gnana Sekar", "Andreas Geiger", "Bastian Leibe" ], "categories": [ "cs.CV" ], "abstract": "This paper extends the popular task of multi-object tracking to multi-object tracking and segmentation (MOTS). Towards this goal, we create dense pixel-level annotations for two existing tracking datasets using a semi-automatic annotation procedure. Our new annotations comprise 70,430 pixel masks for 1,084 distinct objects (cars and pedestrians) in 10,870 video frames. For evaluation, we extend existing multi-object tracking metrics to this new task. Moreover, we propose a new baseline method which jointly addresses detection, tracking, and segmentation with a single convolutional network. We demonstrate the value of our datasets by achieving improvements in performance when training on MOTS annotations. We believe that our datasets, metrics and baseline will become a valuable resource towards developing multi-object tracking approaches that go beyond 2D bounding boxes.", "revisions": [ { "version": "v1", "updated": "2019-02-10T14:01:22.000Z" } ], "analyses": { "keywords": [ "segmentation", "create dense pixel-level annotations", "extend existing multi-object tracking metrics", "semi-automatic annotation procedure", "single convolutional network" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }