{ "id": "2104.10972", "version": "v1", "published": "2021-04-22T10:10:14.000Z", "updated": "2021-04-22T10:10:14.000Z", "title": "ImageNet-21K Pretraining for the Masses", "authors": [ "Tal Ridnik", "Emanuel Ben-Baruch", "Asaf Noy", "Lihi Zelnik-Manor" ], "categories": [ "cs.CV", "cs.LG" ], "abstract": "ImageNet-1K serves as the primary dataset for pretraining deep learning models for computer vision tasks. ImageNet-21K dataset, which contains more pictures and classes, is used less frequently for pretraining, mainly due to its complexity, and underestimation of its added value compared to standard ImageNet-1K pretraining. This paper aims to close this gap, and make high-quality efficient pretraining on ImageNet-21K available for everyone. % Via a dedicated preprocessing stage, utilizing WordNet hierarchies, and a novel training scheme called semantic softmax, we show that various models, including small mobile-oriented models, significantly benefit from ImageNet-21K pretraining on numerous datasets and tasks. We also show that we outperform previous ImageNet-21K pretraining schemes for prominent new models like ViT. % Our proposed pretraining pipeline is efficient, accessible, and leads to SoTA reproducible results, from a publicly available dataset. The training code and pretrained models are available at: https://github.com/Alibaba-MIIL/ImageNet21K", "revisions": [ { "version": "v1", "updated": "2021-04-22T10:10:14.000Z" } ], "analyses": { "keywords": [ "computer vision tasks", "imagenet-21k pretraining schemes", "primary dataset", "high-quality efficient", "small mobile-oriented models" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }