{ "id": "2107.04735", "version": "v1", "published": "2021-07-10T02:34:55.000Z", "updated": "2021-07-10T02:34:55.000Z", "title": "Local-to-Global Self-Attention in Vision Transformers", "authors": [ "Jinpeng Li", "Yichao Yan", "Shengcai Liao", "Xiaokang Yang", "Ling Shao" ], "categories": [ "cs.CV" ], "abstract": "Transformers have demonstrated great potential in computer vision tasks. To avoid dense computations of self-attentions in high-resolution visual data, some recent Transformer models adopt a hierarchical design, where self-attentions are only computed within local windows. This design significantly improves the efficiency but lacks global feature reasoning in early stages. In this work, we design a multi-path structure of the Transformer, which enables local-to-global reasoning at multiple granularities in each stage. The proposed framework is computationally efficient and highly effective. With a marginal increasement in computational overhead, our model achieves notable improvements in both image classification and semantic segmentation. Code is available at https://github.com/ljpadam/LG-Transformer", "revisions": [ { "version": "v1", "updated": "2021-07-10T02:34:55.000Z" } ], "analyses": { "keywords": [ "vision transformers", "local-to-global self-attention", "lacks global feature", "high-resolution visual data", "model achieves notable improvements" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }