{ "id": "1912.00544", "version": "v1", "published": "2019-12-02T02:08:00.000Z", "updated": "2019-12-02T02:08:00.000Z", "title": "Multi-Scale Self-Attention for Text Classification", "authors": [ "Qipeng Guo", "Xipeng Qiu", "Pengfei Liu", "Xiangyang Xue", "Zheng Zhang" ], "comment": "Accepted in AAAI2020", "categories": [ "cs.CL", "cs.LG" ], "abstract": "In this paper, we introduce the prior knowledge, multi-scale structure, into self-attention modules. We propose a Multi-Scale Transformer which uses multi-scale multi-head self-attention to capture features from different scales. Based on the linguistic perspective and the analysis of pre-trained Transformer (BERT) on a huge corpus, we further design a strategy to control the scale distribution for each layer. Results of three different kinds of tasks (21 datasets) show our Multi-Scale Transformer outperforms the standard Transformer consistently and significantly on small and moderate size datasets.", "revisions": [ { "version": "v1", "updated": "2019-12-02T02:08:00.000Z" } ], "analyses": { "keywords": [ "text classification", "multi-scale self-attention", "multi-scale multi-head self-attention", "multi-scale transformer outperforms", "capture features" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }