{ "id": "2106.03714", "version": "v1", "published": "2021-06-07T15:24:54.000Z", "updated": "2021-06-07T15:24:54.000Z", "title": "Refiner: Refining Self-attention for Vision Transformers", "authors": [ "Daquan Zhou", "Yujun Shi", "Bingyi Kang", "Weihao Yu", "Zihang Jiang", "Yuan Li", "Xiaojie Jin", "Qibin Hou", "Jiashi Feng" ], "categories": [ "cs.CV" ], "abstract": "Vision Transformers (ViTs) have shown competitive accuracy in image classification tasks compared with CNNs. Yet, they generally require much more data for model pre-training. Most of recent works thus are dedicated to designing more complex architectures or training methods to address the data-efficiency issue of ViTs. However, few of them explore improving the self-attention mechanism, a key factor distinguishing ViTs from CNNs. Different from existing works, we introduce a conceptually simple scheme, called refiner, to directly refine the self-attention maps of ViTs. Specifically, refiner explores attention expansion that projects the multi-head attention maps to a higher-dimensional space to promote their diversity. Further, refiner applies convolutions to augment local patterns of the attention maps, which we show is equivalent to a distributed local attention features are aggregated locally with learnable kernels and then globally aggregated with self-attention. Extensive experiments demonstrate that refiner works surprisingly well. Significantly, it enables ViTs to achieve 86% top-1 classification accuracy on ImageNet with only 81M parameters.", "revisions": [ { "version": "v1", "updated": "2021-06-07T15:24:54.000Z" } ], "analyses": { "keywords": [ "vision transformers", "refining self-attention", "image classification tasks", "multi-head attention maps", "refiner applies convolutions" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }