{ "id": "1705.08631", "version": "v1", "published": "2017-05-24T06:59:30.000Z", "updated": "2017-05-24T06:59:30.000Z", "title": "Self-supervised learning of visual features through embedding images into text topic spaces", "authors": [ "Lluis Gomez", "Yash Patel", "Marçal Rusiñol", "Dimosthenis Karatzas", "C. V. Jawahar" ], "comment": "Accepted CVPR 2017 paper", "categories": [ "cs.CV" ], "abstract": "End-to-end training from scratch of current deep architectures for new computer vision problems would require Imagenet-scale datasets, and this is not always possible. In this paper we present a method that is able to take advantage of freely available multi-modal content to train computer vision algorithms without human supervision. We put forward the idea of performing self-supervised learning of visual features by mining a large scale corpus of multi-modal (text and image) documents. We show that discriminative visual features can be learnt efficiently by training a CNN to predict the semantic context in which a particular image is more probable to appear as an illustration. For this we leverage the hidden semantic structures discovered in the text corpus with a well-known topic modeling technique. Our experiments demonstrate state of the art performance in image classification, object detection, and multi-modal retrieval compared to recent self-supervised or natural-supervised approaches.", "revisions": [ { "version": "v1", "updated": "2017-05-24T06:59:30.000Z" } ], "analyses": { "keywords": [ "visual features", "text topic spaces", "self-supervised learning", "embedding images", "train computer vision algorithms" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }