{ "id": "2305.07304", "version": "v1", "published": "2023-05-12T08:19:39.000Z", "updated": "2023-05-12T08:19:39.000Z", "title": "CLIP-Count: Towards Text-Guided Zero-Shot Object Counting", "authors": [ "Ruixiang Jiang", "Lingbo Liu", "Changwen Chen" ], "comment": "Under review", "categories": [ "cs.CV", "cs.AI" ], "abstract": "Recent advances in visual-language models have shown remarkable zero-shot text-image matching ability that is transferable to down-stream tasks such as object detection and segmentation. However, adapting these models for object counting, which involves estimating the number of objects in an image, remains a formidable challenge. In this study, we conduct the first exploration of transferring visual-language models for class-agnostic object counting. Specifically, we propose CLIP-Count, a novel pipeline that estimates density maps for open-vocabulary objects with text guidance in a zero-shot manner, without requiring any finetuning on specific object classes. To align the text embedding with dense image features, we introduce a patch-text contrastive loss that guides the model to learn informative patch-level image representations for dense prediction. Moreover, we design a hierarchical patch-text interaction module that propagates semantic information across different resolution levels of image features. Benefiting from the full exploitation of the rich image-text alignment knowledge of pretrained visual-language models, our method effectively generates high-quality density maps for objects-of-interest. Extensive experiments on FSC-147, CARPK, and ShanghaiTech crowd counting datasets demonstrate that our proposed method achieves state-of-the-art accuracy and generalizability for zero-shot object counting. Project page at https://github.com/songrise/CLIP-Count", "revisions": [ { "version": "v1", "updated": "2023-05-12T08:19:39.000Z" } ], "analyses": { "keywords": [ "text-guided zero-shot object counting", "generates high-quality density maps", "informative patch-level image representations", "crowd counting datasets demonstrate", "effectively generates high-quality density" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }