{ "id": "2106.11539", "version": "v1", "published": "2021-06-22T04:28:07.000Z", "updated": "2021-06-22T04:28:07.000Z", "title": "DocFormer: End-to-End Transformer for Document Understanding", "authors": [ "Srikar Appalaraju", "Bhavan Jasani", "Bhargava Urala Kota", "Yusheng Xie", "R. Manmatha" ], "categories": [ "cs.CV" ], "abstract": "We present DocFormer -- a multi-modal transformer based architecture for the task of Visual Document Understanding (VDU). VDU is a challenging problem which aims to understand documents in their varied formats (forms, receipts etc.) and layouts. In addition, DocFormer is pre-trained in an unsupervised fashion using carefully designed tasks which encourage multi-modal interaction. DocFormer uses text, vision and spatial features and combines them using a novel multi-modal self-attention layer. DocFormer also shares learned spatial embeddings across modalities which makes it easy for the model to correlate text to visual tokens and vice versa. DocFormer is evaluated on 4 different datasets each with strong baselines. DocFormer achieves state-of-the-art results on all of them, sometimes beating models 4x its size (in no. of parameters).", "revisions": [ { "version": "v1", "updated": "2021-06-22T04:28:07.000Z" } ], "analyses": { "keywords": [ "end-to-end transformer", "document understanding", "docformer achieves state-of-the-art results", "novel multi-modal self-attention layer", "shares learned spatial embeddings" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }