{ "id": "1902.03604", "version": "v1", "published": "2019-02-10T14:01:22.000Z", "updated": "2019-02-10T14:01:22.000Z", "title": "MOTS: Multi-Object Tracking and Segmentation", "authors": [ "Paul Voigtlaender", "Michael Krause", "Aljosa Osep", "Jonathon Luiten", "Berin Balachandar Gnana Sekar", "Andreas Geiger", "Bastian Leibe" ], "categories": [ "cs.CV" ], "abstract": "This paper extends the popular task of multi-object tracking to multi-object tracking and segmentation (MOTS). Towards this goal, we create dense pixel-level annotations for two existing tracking datasets using a semi-automatic annotation procedure. Our new annotations comprise 70,430 pixel masks for 1,084 distinct objects (cars and pedestrians) in 10,870 video frames. For evaluation, we extend existing multi-object tracking metrics to this new task. Moreover, we propose a new baseline method which jointly addresses detection, tracking, and segmentation with a single convolutional network. We demonstrate the value of our datasets by achieving improvements in performance when training on MOTS annotations. We believe that our datasets, metrics and baseline will become a valuable resource towards developing multi-object tracking approaches that go beyond 2D bounding boxes.", "revisions": [ { "version": "v1", "updated": "2019-02-10T14:01:22.000Z" } ], "analyses": { "keywords": [ "segmentation", "create dense pixel-level annotations", "extend existing multi-object tracking metrics", "semi-automatic annotation procedure", "single convolutional network" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }