{ "id": "2104.10972", "version": "v1", "published": "2021-04-22T10:10:14.000Z", "updated": "2021-04-22T10:10:14.000Z", "title": "ImageNet-21K Pretraining for the Masses", "authors": [ "Tal Ridnik", "Emanuel Ben-Baruch", "Asaf Noy", "Lihi Zelnik-Manor" ], "categories": [ "cs.CV", "cs.LG" ], "abstract": "ImageNet-1K serves as the primary dataset for pretraining deep learning models for computer vision tasks. ImageNet-21K dataset, which contains more pictures and classes, is used less frequently for pretraining, mainly due to its complexity, and underestimation of its added value compared to standard ImageNet-1K pretraining. This paper aims to close this gap, and make high-quality efficient pretraining on ImageNet-21K available for everyone. % Via a dedicated preprocessing stage, utilizing WordNet hierarchies, and a novel training scheme called semantic softmax, we show that various models, including small mobile-oriented models, significantly benefit from ImageNet-21K pretraining on numerous datasets and tasks. We also show that we outperform previous ImageNet-21K pretraining schemes for prominent new models like ViT. % Our proposed pretraining pipeline is efficient, accessible, and leads to SoTA reproducible results, from a publicly available dataset. The training code and pretrained models are available at: https://github.com/Alibaba-MIIL/ImageNet21K", "revisions": [ { "version": "v1", "updated": "2021-04-22T10:10:14.000Z" } ], "analyses": { "keywords": [ "computer vision tasks", "imagenet-21k pretraining schemes", "primary dataset", "high-quality efficient", "small mobile-oriented models" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }