{ "id": "2106.15788", "version": "v1", "published": "2021-06-30T02:56:26.000Z", "updated": "2021-06-30T02:56:26.000Z", "title": "Align Yourself: Self-supervised Pre-training for Fine-grained Recognition via Saliency Alignment", "authors": [ "Di Wu", "Siyuan Li", "Zelin Zang", "Kai Wang", "Lei Shang", "Baigui Sun", "Hao Li", "Stan Z. Li" ], "categories": [ "cs.CV" ], "abstract": "Self-supervised contrastive learning has demonstrated great potential in learning visual representations. Despite their success on various downstream tasks such as image classification and object detection, self-supervised pre-training for fine-grained scenarios is not fully explored. In this paper, we first point out that current contrastive methods are prone to memorizing background/foreground texture and therefore have a limitation in localizing the foreground object. Analysis suggests that learning to extract discriminative texture information and localization are equally crucial for self-supervised pre-training under fine-grained scenarios. Based on our findings, we introduce Cross-view Saliency Alignment (CVSA), a contrastive learning framework that first crops and swaps saliency regions of images as a novel view generation and then guides the model to localize on the foreground object via a cross-view alignment loss. Extensive experiments on four popular fine-grained classification benchmarks show that CVSA significantly improves the learned representation.", "revisions": [ { "version": "v1", "updated": "2021-06-30T02:56:26.000Z" } ], "analyses": { "keywords": [ "self-supervised pre-training", "fine-grained recognition", "foreground object", "extract discriminative texture information", "fine-grained scenarios" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }