{ "id": "2312.03045", "version": "v1", "published": "2023-12-05T16:54:42.000Z", "updated": "2023-12-05T16:54:42.000Z", "title": "Customization Assistant for Text-to-image Generation", "authors": [ "Yufan Zhou", "Ruiyi Zhang", "Jiuxiang Gu", "Tong Sun" ], "categories": [ "cs.CV" ], "abstract": "Customizing pre-trained text-to-image generation model has attracted massive research interest recently, due to its huge potential in real-world applications. Although existing methods are able to generate creative content for a novel concept contained in single user-input image, their capability are still far from perfection. Specifically, most existing methods require fine-tuning the generative model on testing images. Some existing methods do not require fine-tuning, while their performance are unsatisfactory. Furthermore, the interaction between users and models are still limited to directive and descriptive prompts such as instructions and captions. In this work, we build a customization assistant based on pre-trained large language model and diffusion model, which can not only perform customized generation in a tuning-free manner, but also enable more user-friendly interactions: users can chat with the assistant and input either ambiguous text or clear instruction. Specifically, we propose a new framework consists of a new model design and a novel training strategy. The resulting assistant can perform customized generation in 2-5 seconds without any test time fine-tuning. Extensive experiments are conducted, competitive results have been obtained across different domains, illustrating the effectiveness of the proposed method.", "revisions": [ { "version": "v1", "updated": "2023-12-05T16:54:42.000Z" } ], "analyses": { "keywords": [ "customization assistant", "existing methods", "perform customized generation", "customizing pre-trained text-to-image generation model", "single user-input image" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }