{ "id": "1609.08976", "version": "v1", "published": "2016-09-28T15:56:15.000Z", "updated": "2016-09-28T15:56:15.000Z", "title": "Variational Autoencoder for Deep Learning of Images, Labels and Captions", "authors": [ "Yunchen Pu", "Zhe Gan", "Ricardo Henao", "Xin Yuan", "Chunyuan Li", "Andrew Stevens", "Lawrence Carin" ], "comment": "NIPS 2016 (To appear)", "categories": [ "stat.ML", "cs.LG" ], "abstract": "A novel variational autoencoder is developed to model images, as well as associated labels or captions. The Deep Generative Deconvolutional Network (DGDN) is used as a decoder of the latent image features, and a deep Convolutional Neural Network (CNN) is used as an image encoder; the CNN is used to approximate a distribution for the latent DGDN features/code. The latent code is also linked to generative models for labels (Bayesian support vector machine) or captions (recurrent neural network). When predicting a label/caption for a new image at test time, averaging is performed across the distribution of latent codes; this is computationally efficient as a consequence of the learned CNN-based encoder. Since the framework is capable of modeling the image in the presence/absence of associated labels/captions, a new semi-supervised setting is manifested for CNN learning with images; the framework even allows unsupervised CNN learning, based on images alone.", "revisions": [ { "version": "v1", "updated": "2016-09-28T15:56:15.000Z" } ], "analyses": { "keywords": [ "deep learning", "bayesian support vector machine", "latent code", "deep convolutional neural network", "deep generative deconvolutional network" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }