{ "id": "2305.13035", "version": "v1", "published": "2023-05-22T13:39:28.000Z", "updated": "2023-05-22T13:39:28.000Z", "title": "Getting ViT in Shape: Scaling Laws for Compute-Optimal Model Design", "authors": [ "Ibrahim Alabdulmohsin", "Xiaohua Zhai", "Alexander Kolesnikov", "Lucas Beyer" ], "comment": "10 pages, 7 figures, 9 tables", "categories": [ "cs.CV", "cs.LG" ], "abstract": "Scaling laws have been recently employed to derive compute-optimal model size (number of parameters) for a given compute duration. We advance and refine such methods to infer compute-optimal model shapes, such as width and depth, and successfully implement this in vision transformers. Our shape-optimized vision transformer, SoViT, achieves results competitive with models that exceed twice its size, despite being pre-trained with an equivalent amount of compute. For example, SoViT-400m/14 achieves 90.3% fine-tuning accuracy on ILSRCV2012, surpassing the much larger ViT-g/14 and approaching ViT-G/14 under identical settings, with also less than half the inference cost. We conduct a thorough evaluation across multiple tasks, such as image classification, captioning, VQA and zero-shot transfer, demonstrating the effectiveness of our model across a broad range of domains and identifying limitations. Overall, our findings challenge the prevailing approach of blindly scaling up vision models and pave a path for a more informed scaling.", "revisions": [ { "version": "v1", "updated": "2023-05-22T13:39:28.000Z" } ], "analyses": { "subjects": [ "I.2.6" ], "keywords": [ "compute-optimal model design", "scaling laws", "getting vit", "infer compute-optimal model shapes", "derive compute-optimal model" ], "note": { "typesetting": "TeX", "pages": 10, "language": "en", "license": "arXiv", "status": "editable" } } }