{ "id": "2105.03343", "version": "v1", "published": "2021-05-07T15:51:08.000Z", "updated": "2021-05-07T15:51:08.000Z", "title": "Adapting by Pruning: A Case Study on BERT", "authors": [ "Yang Gao", "Nicolo Colombo", "Wei Wang" ], "categories": [ "cs.LG", "cs.CL" ], "abstract": "Adapting pre-trained neural models to downstream tasks has become the standard practice for obtaining high-quality models. In this work, we propose a novel model adaptation paradigm, adapting by pruning, which prunes neural connections in the pre-trained model to optimise the performance on the target task; all remaining connections have their weights intact. We formulate adapting-by-pruning as an optimisation problem with a differentiable loss and propose an efficient algorithm to prune the model. We prove that the algorithm is near-optimal under standard assumptions and apply the algorithm to adapt BERT to some GLUE tasks. Results suggest that our method can prune up to 50% weights in BERT while yielding similar performance compared to the fine-tuned full model. We also compare our method with other state-of-the-art pruning methods and study the topological differences of their obtained sub-networks.", "revisions": [ { "version": "v1", "updated": "2021-05-07T15:51:08.000Z" } ], "analyses": { "keywords": [ "case study", "novel model adaptation paradigm", "prunes neural connections", "yielding similar performance", "obtaining high-quality models" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }