{
  "id": "1511.06449",
  "version": "v1",
  "published": "2015-11-19T23:30:06.000Z",
  "updated": "2015-11-19T23:30:06.000Z",
  "title": "Learning to decompose for object detection and instance segmentation",
  "authors": [
    "Eunbyung Park",
    "Alexander C. Berg"
  ],
  "comment": "ICLR 2016 submission",
  "categories": [
    "cs.CV",
    "cs.LG"
  ],
  "abstract": "Although deep convolutional neural networks(CNNs) have achieved remarkable results on object detection and segmentation, pre- and post-processing steps such as region proposals and non-maximum suppression(NMS), have been required. These steps result in high computational complexity and sensitivity to hyperparameters, e.g. thresholds for NMS. In this work, we propose a novel end-to-end trainable deep neural network architecture that generates the correct number of object instances and their bounding boxes (or segmentation masks) given an image, using only a single network evaluation without any pre- or post-processing steps. We have tested on detecting digits in multi-digit images synthesized using MNIST, automatically segmenting digits in these images, and detecting cars in the KITTI benchmark dataset. The proposed approach outperforms a strong CNN baseline on the synthesized digits datasets and shows promising results on KITTI car detection.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2015-11-19T23:30:06.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "object detection",
      "instance segmentation",
      "end-to-end trainable deep neural network",
      "trainable deep neural network architecture",
      "novel end-to-end trainable deep neural"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable",
      "adsabs": "2015arXiv151106449P"
    }
  }
}