{ "id": "2104.10157", "version": "v1", "published": "2021-04-20T17:58:03.000Z", "updated": "2021-04-20T17:58:03.000Z", "title": "VideoGPT: Video Generation using VQ-VAE and Transformers", "authors": [ "Wilson Yan", "Yunzhi Zhang", "Pieter Abbeel", "Aravind Srinivas" ], "comment": "Project website: https://wilson1yan.github.io/videogpt/index.html", "categories": [ "cs.CV", "cs.LG" ], "abstract": "We present VideoGPT: a conceptually simple architecture for scaling likelihood based generative modeling to natural videos. VideoGPT uses VQ-VAE that learns downsampled discrete latent representations of a raw video by employing 3D convolutions and axial self-attention. A simple GPT-like architecture is then used to autoregressively model the discrete latents using spatio-temporal position encodings. Despite the simplicity in formulation and ease of training, our architecture is able to generate samples competitive with state-of-the-art GAN models for video generation on the BAIR Robot dataset, and generate high fidelity natural images from UCF-101 and Tumbler GIF Dataset (TGIF). We hope our proposed architecture serves as a reproducible reference for a minimalistic implementation of transformer based video generation models. Samples and code are available at https://wilson1yan.github.io/videogpt/index.html", "revisions": [ { "version": "v1", "updated": "2021-04-20T17:58:03.000Z" } ], "analyses": { "keywords": [ "video generation", "transformer", "generate high fidelity natural images", "learns downsampled discrete latent representations", "architecture" ], "tags": [ "github project" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }