{
  "id": "2004.10605",
  "version": "v1",
  "published": "2020-04-18T10:14:06.000Z",
  "updated": "2020-04-18T10:14:06.000Z",
  "title": "Self-Supervised Representation Learning on Document Images",
  "authors": [
    "Adrian Cosma",
    "Mihai Ghidoveanu",
    "Michael Panaitescu-Liess",
    "Marius Popescu"
  ],
  "comment": "15 pages, 5 figures. Accepted at DAS 2020: IAPR International Workshop on Document Analysis Systems",
  "categories": [
    "cs.CV",
    "cs.LG",
    "eess.IV",
    "stat.ML"
  ],
  "abstract": "This work analyses the impact of self-supervised pre-training on document images. While previous approaches explore the effect of self-supervision on natural images, we show that patch-based pre-training performs poorly on text document images because of their different structural properties and poor intra-sample semantic information. We propose two context-aware alternatives to improve performance. We also propose a novel method for self-supervision, which makes use of the inherent multi-modality of documents (image and text), which performs better than other popular self-supervised methods, including supervised ImageNet pre-training.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2020-04-18T10:14:06.000Z"
    }
  ],
  "analyses": {
    "subjects": [
      "68T05"
    ],
    "keywords": [
      "self-supervised representation learning",
      "poor intra-sample semantic information",
      "text document images",
      "inherent multi-modality",
      "novel method"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 15,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}