{ "id": "2107.04735", "version": "v1", "published": "2021-07-10T02:34:55.000Z", "updated": "2021-07-10T02:34:55.000Z", "title": "Local-to-Global Self-Attention in Vision Transformers", "authors": [ "Jinpeng Li", "Yichao Yan", "Shengcai Liao", "Xiaokang Yang", "Ling Shao" ], "categories": [ "cs.CV" ], "abstract": "Transformers have demonstrated great potential in computer vision tasks. To avoid dense computations of self-attentions in high-resolution visual data, some recent Transformer models adopt a hierarchical design, where self-attentions are only computed within local windows. This design significantly improves the efficiency but lacks global feature reasoning in early stages. In this work, we design a multi-path structure of the Transformer, which enables local-to-global reasoning at multiple granularities in each stage. The proposed framework is computationally efficient and highly effective. With a marginal increasement in computational overhead, our model achieves notable improvements in both image classification and semantic segmentation. Code is available at https://github.com/ljpadam/LG-Transformer", "revisions": [ { "version": "v1", "updated": "2021-07-10T02:34:55.000Z" } ], "analyses": { "keywords": [ "vision transformers", "local-to-global self-attention", "lacks global feature", "high-resolution visual data", "model achieves notable improvements" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }