{ "id": "2105.03343", "version": "v1", "published": "2021-05-07T15:51:08.000Z", "updated": "2021-05-07T15:51:08.000Z", "title": "Adapting by Pruning: A Case Study on BERT", "authors": [ "Yang Gao", "Nicolo Colombo", "Wei Wang" ], "categories": [ "cs.LG", "cs.CL" ], "abstract": "Adapting pre-trained neural models to downstream tasks has become the standard practice for obtaining high-quality models. In this work, we propose a novel model adaptation paradigm, adapting by pruning, which prunes neural connections in the pre-trained model to optimise the performance on the target task; all remaining connections have their weights intact. We formulate adapting-by-pruning as an optimisation problem with a differentiable loss and propose an efficient algorithm to prune the model. We prove that the algorithm is near-optimal under standard assumptions and apply the algorithm to adapt BERT to some GLUE tasks. Results suggest that our method can prune up to 50% weights in BERT while yielding similar performance compared to the fine-tuned full model. We also compare our method with other state-of-the-art pruning methods and study the topological differences of their obtained sub-networks.", "revisions": [ { "version": "v1", "updated": "2021-05-07T15:51:08.000Z" } ], "analyses": { "keywords": [ "case study", "novel model adaptation paradigm", "prunes neural connections", "yielding similar performance", "obtaining high-quality models" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }