{ "id": "2305.07304", "version": "v1", "published": "2023-05-12T08:19:39.000Z", "updated": "2023-05-12T08:19:39.000Z", "title": "CLIP-Count: Towards Text-Guided Zero-Shot Object Counting", "authors": [ "Ruixiang Jiang", "Lingbo Liu", "Changwen Chen" ], "comment": "Under review", "categories": [ "cs.CV", "cs.AI" ], "abstract": "Recent advances in visual-language models have shown remarkable zero-shot text-image matching ability that is transferable to down-stream tasks such as object detection and segmentation. However, adapting these models for object counting, which involves estimating the number of objects in an image, remains a formidable challenge. In this study, we conduct the first exploration of transferring visual-language models for class-agnostic object counting. Specifically, we propose CLIP-Count, a novel pipeline that estimates density maps for open-vocabulary objects with text guidance in a zero-shot manner, without requiring any finetuning on specific object classes. To align the text embedding with dense image features, we introduce a patch-text contrastive loss that guides the model to learn informative patch-level image representations for dense prediction. Moreover, we design a hierarchical patch-text interaction module that propagates semantic information across different resolution levels of image features. Benefiting from the full exploitation of the rich image-text alignment knowledge of pretrained visual-language models, our method effectively generates high-quality density maps for objects-of-interest. Extensive experiments on FSC-147, CARPK, and ShanghaiTech crowd counting datasets demonstrate that our proposed method achieves state-of-the-art accuracy and generalizability for zero-shot object counting. Project page at https://github.com/songrise/CLIP-Count", "revisions": [ { "version": "v1", "updated": "2023-05-12T08:19:39.000Z" } ], "analyses": { "keywords": [ "text-guided zero-shot object counting", "generates high-quality density maps", "informative patch-level image representations", "crowd counting datasets demonstrate", "effectively generates high-quality density" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }