{
  "id": "2106.15788",
  "version": "v1",
  "published": "2021-06-30T02:56:26.000Z",
  "updated": "2021-06-30T02:56:26.000Z",
  "title": "Align Yourself: Self-supervised Pre-training for Fine-grained Recognition via Saliency Alignment",
  "authors": [
    "Di Wu",
    "Siyuan Li",
    "Zelin Zang",
    "Kai Wang",
    "Lei Shang",
    "Baigui Sun",
    "Hao Li",
    "Stan Z. Li"
  ],
  "categories": [
    "cs.CV"
  ],
  "abstract": "Self-supervised contrastive learning has demonstrated great potential in learning visual representations. Despite their success on various downstream tasks such as image classification and object detection, self-supervised pre-training for fine-grained scenarios is not fully explored. In this paper, we first point out that current contrastive methods are prone to memorizing background/foreground texture and therefore have a limitation in localizing the foreground object. Analysis suggests that learning to extract discriminative texture information and localization are equally crucial for self-supervised pre-training under fine-grained scenarios. Based on our findings, we introduce Cross-view Saliency Alignment (CVSA), a contrastive learning framework that first crops and swaps saliency regions of images as a novel view generation and then guides the model to localize on the foreground object via a cross-view alignment loss. Extensive experiments on four popular fine-grained classification benchmarks show that CVSA significantly improves the learned representation.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2021-06-30T02:56:26.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "self-supervised pre-training",
      "fine-grained recognition",
      "foreground object",
      "extract discriminative texture information",
      "fine-grained scenarios"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}