{ "id": "2403.13378", "version": "v1", "published": "2024-03-20T08:21:00.000Z", "updated": "2024-03-20T08:21:00.000Z", "title": "IIDM: Image-to-Image Diffusion Model for Semantic Image Synthesis", "authors": [ "Feng Liu", "Xiaobin Chang" ], "comment": "6 pages, 7 figures, accepted by CVMJ 2024", "categories": [ "cs.CV" ], "abstract": "Semantic image synthesis aims to generate high-quality images given semantic conditions, i.e. segmentation masks and style reference images. Existing methods widely adopt generative adversarial networks (GANs). GANs take all conditional inputs and directly synthesize images in a single forward step. In this paper, semantic image synthesis is treated as an image denoising task and is handled with a novel image-to-image diffusion model (IIDM). Specifically, the style reference is first contaminated with random noise and then progressively denoised by IIDM, guided by segmentation masks. Moreover, three techniques, refinement, color-transfer and model ensembles, are proposed to further boost the generation quality. They are plug-in inference modules and do not require additional training. Extensive experiments show that our IIDM outperforms existing state-of-the-art methods by clear margins. Further analysis is provided via detailed demonstrations. We have implemented IIDM based on the Jittor framework; code is available at https://github.com/ader47/jittor-jieke-semantic_images_synthesis.", "revisions": [ { "version": "v1", "updated": "2024-03-20T08:21:00.000Z" } ], "analyses": { "keywords": [ "segmentation masks", "iidm outperforms existing state-of-the-art methods", "novel image-to-image diffusion model", "semantic image synthesis aims", "adopt generative adversarial networks" ], "note": { "typesetting": "TeX", "pages": 6, "language": "en", "license": "arXiv", "status": "editable" } } }