{ "id": "1511.06449", "version": "v1", "published": "2015-11-19T23:30:06.000Z", "updated": "2015-11-19T23:30:06.000Z", "title": "Learning to decompose for object detection and instance segmentation", "authors": [ "Eunbyung Park", "Alexander C. Berg" ], "comment": "ICLR 2016 submission", "categories": [ "cs.CV", "cs.LG" ], "abstract": "Although deep convolutional neural networks(CNNs) have achieved remarkable results on object detection and segmentation, pre- and post-processing steps such as region proposals and non-maximum suppression(NMS), have been required. These steps result in high computational complexity and sensitivity to hyperparameters, e.g. thresholds for NMS. In this work, we propose a novel end-to-end trainable deep neural network architecture that generates the correct number of object instances and their bounding boxes (or segmentation masks) given an image, using only a single network evaluation without any pre- or post-processing steps. We have tested on detecting digits in multi-digit images synthesized using MNIST, automatically segmenting digits in these images, and detecting cars in the KITTI benchmark dataset. The proposed approach outperforms a strong CNN baseline on the synthesized digits datasets and shows promising results on KITTI car detection.", "revisions": [ { "version": "v1", "updated": "2015-11-19T23:30:06.000Z" } ], "analyses": { "keywords": [ "object detection", "instance segmentation", "end-to-end trainable deep neural network", "trainable deep neural network architecture", "novel end-to-end trainable deep neural" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable", "adsabs": "2015arXiv151106449P" } } }