{ "id": "2311.05844", "version": "v1", "published": "2023-09-25T13:46:00.000Z", "updated": "2023-09-25T13:46:00.000Z", "title": "Face-StyleSpeech: Improved Face-to-Voice latent mapping for Natural Zero-shot Speech Synthesis from a Face Image", "authors": [ "Minki Kang", "Wooseok Han", "Eunho Yang" ], "comment": "Submitted to ICASSP 2024", "categories": [ "cs.CV", "cs.AI", "cs.CL", "cs.MM", "cs.SD", "eess.AS" ], "abstract": "Generating a voice from a face image is crucial for developing virtual humans capable of interacting using their unique voices, without relying on pre-recorded human speech. In this paper, we propose Face-StyleSpeech, a zero-shot Text-To-Speech (TTS) synthesis model that generates natural speech conditioned on a face image rather than reference speech. We hypothesize that learning both speaker identity and prosody from a face image poses a significant challenge. To address the issue, our TTS model incorporates both a face encoder and a prosody encoder. The prosody encoder is specifically designed to model prosodic features that are not captured by a face image alone, allowing the face encoder to focus solely on capturing the speaker identity from the face image. Experimental results demonstrate that Face-StyleSpeech effectively generates more natural speech from a face image than baselines, even for face images the model has not been trained on. Samples are at our demo page https://face-stylespeech.github.io.", "revisions": [ { "version": "v1", "updated": "2023-09-25T13:46:00.000Z" } ], "analyses": { "keywords": [ "face image", "natural zero-shot speech synthesis", "face-to-voice latent mapping", "face-stylespeech", "natural speech" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }