{ "id": "2505.03703", "version": "v1", "published": "2025-05-06T17:24:41.000Z", "updated": "2025-05-06T17:24:41.000Z", "title": "Fill the Gap: Quantifying and Reducing the Modality Gap in Image-Text Representation Learning", "authors": [ "François Role", "Sébastien Meyer", "Victor Amblard" ], "categories": [ "cs.CV", "cs.LG" ], "abstract": "Vision-language models (VLMs) allow to embed texts and images in a shared representation space. However, it has been shown that these models are subject to a modality gap phenomenon meaning there exists a clear separation between the embeddings from one modality and another in the embedding space. While this misalignment is detrimental for downstream tasks such as multimodal retrieval, multimodal clustering or zero-shot classification, etc. no generic and practical methods have so far been proposed to assess it precisely and even reduce it. We therefore propose novel measures and effective techniques (spectral- and optimal transport-based methods) to achieve this goal. Extensive experiments conducted on several image-text datasets and models demonstrate their effectiveness and beneficial effects on downstream tasks. Our code is available at the URL provided in the paper's abstract.", "revisions": [ { "version": "v1", "updated": "2025-05-06T17:24:41.000Z" } ], "analyses": { "keywords": [ "image-text representation learning", "downstream tasks", "vision-language models", "multimodal", "beneficial effects" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }