{ "id": "2505.03703", "version": "v1", "published": "2025-05-06T17:24:41.000Z", "updated": "2025-05-06T17:24:41.000Z", "title": "Fill the Gap: Quantifying and Reducing the Modality Gap in Image-Text Representation Learning", "authors": [ "François Role", "Sébastien Meyer", "Victor Amblard" ], "categories": [ "cs.CV", "cs.LG" ], "abstract": "Vision-language models (VLMs) allow to embed texts and images in a shared representation space. However, it has been shown that these models are subject to a modality gap phenomenon meaning there exists a clear separation between the embeddings from one modality and another in the embedding space. While this misalignment is detrimental for downstream tasks such as multimodal retrieval, multimodal clustering or zero-shot classification, etc. no generic and practical methods have so far been proposed to assess it precisely and even reduce it. We therefore propose novel measures and effective techniques (spectral- and optimal transport-based methods) to achieve this goal. Extensive experiments conducted on several image-text datasets and models demonstrate their effectiveness and beneficial effects on downstream tasks. Our code is available at the URL provided in the paper's abstract.", "revisions": [ { "version": "v1", "updated": "2025-05-06T17:24:41.000Z" } ], "analyses": { "keywords": [ "image-text representation learning", "downstream tasks", "vision-language models", "multimodal", "beneficial effects" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }