{ "id": "2004.10605", "version": "v1", "published": "2020-04-18T10:14:06.000Z", "updated": "2020-04-18T10:14:06.000Z", "title": "Self-Supervised Representation Learning on Document Images", "authors": [ "Adrian Cosma", "Mihai Ghidoveanu", "Michael Panaitescu-Liess", "Marius Popescu" ], "comment": "15 pages, 5 figures. Accepted at DAS 2020: IAPR International Workshop on Document Analysis Systems", "categories": [ "cs.CV", "cs.LG", "eess.IV", "stat.ML" ], "abstract": "This work analyses the impact of self-supervised pre-training on document images. While previous approaches explore the effect of self-supervision on natural images, we show that patch-based pre-training performs poorly on text document images because of their different structural properties and poor intra-sample semantic information. We propose two context-aware alternatives to improve performance. We also propose a novel method for self-supervision, which makes use of the inherent multi-modality of documents (image and text), which performs better than other popular self-supervised methods, including supervised ImageNet pre-training.", "revisions": [ { "version": "v1", "updated": "2020-04-18T10:14:06.000Z" } ], "analyses": { "subjects": [ "68T05" ], "keywords": [ "self-supervised representation learning", "poor intra-sample semantic information", "text document images", "inherent multi-modality", "novel method" ], "note": { "typesetting": "TeX", "pages": 15, "language": "en", "license": "arXiv", "status": "editable" } } }