{ "id": "2311.05844", "version": "v1", "published": "2023-09-25T13:46:00.000Z", "updated": "2023-09-25T13:46:00.000Z", "title": "Face-StyleSpeech: Improved Face-to-Voice latent mapping for Natural Zero-shot Speech Synthesis from a Face Image", "authors": [ "Minki Kang", "Wooseok Han", "Eunho Yang" ], "comment": "Submitted to ICASSP 2024", "categories": [ "cs.CV", "cs.AI", "cs.CL", "cs.MM", "cs.SD", "eess.AS" ], "abstract": "Generating a voice from a face image is crucial for developing virtual humans capable of interacting using their unique voices, without relying on pre-recorded human speech. In this paper, we propose Face-StyleSpeech, a zero-shot Text-To-Speech (TTS) synthesis model that generates natural speech conditioned on a face image rather than reference speech. We hypothesize that learning both speaker identity and prosody from a face image poses a significant challenge. To address the issue, our TTS model incorporates both a face encoder and a prosody encoder. The prosody encoder is specifically designed to model prosodic features that are not captured by a face image alone, allowing the face encoder to focus solely on capturing the speaker identity from the face image. Experimental results demonstrate that Face-StyleSpeech effectively generates more natural speech from a face image than baselines, even for face images the model has not been trained on. Samples are at our demo page https://face-stylespeech.github.io.", "revisions": [ { "version": "v1", "updated": "2023-09-25T13:46:00.000Z" } ], "analyses": { "keywords": [ "face image", "natural zero-shot speech synthesis", "face-to-voice latent mapping", "face-stylespeech", "natural speech" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }