{ "id": "1912.05523", "version": "v1", "published": "2019-12-11T18:46:53.000Z", "updated": "2019-12-11T18:46:53.000Z", "title": "$\\mathbf{G^{3}AN}$: This video does not exist. Disentangling motion and appearance for video generation", "authors": [ "Yaohui Wang", "Piotr Bilinski", "Francois Bremond", "Antitza Dantcheva" ], "categories": [ "cs.CV" ], "abstract": "Creating realistic human videos introduces the challenge of being able to simultaneously generate both appearance, as well as motion. To tackle this challenge, we propose the novel spatio-temporal GAN-architecture $G^3AN$, which seeks to capture the distribution of high dimensional video data and to model appearance and motion in disentangled manner. The latter is achieved by decomposing appearance and motion in a three-stream Generator, where the main stream aims to model spatio-temporal consistency, whereas the two auxiliary streams augment the main stream with multi-scale appearance and motion features, respectively. An extensive quantitative and qualitative analysis shows that our model systematically and significantly outperforms state-of-the-art methods on the facial expression datasets MUG and UvA-NEMO, as well as the Weizmann and UCF101 datasets on human action. Additional analysis on the learned latent representations confirms the successful decomposition of appearance and motion.", "revisions": [ { "version": "v1", "updated": "2019-12-11T18:46:53.000Z" } ], "analyses": { "keywords": [ "appearance", "video generation", "disentangling motion", "facial expression datasets mug", "high dimensional video data" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }