{ "id": "2402.09401", "version": "v2", "published": "2024-02-14T18:58:40.000Z", "updated": "2025-02-11T18:18:59.000Z", "title": "Reinforcement Learning from Human Feedback with Active Queries", "authors": [ "Kaixuan Ji", "Jiafan He", "Quanquan Gu" ], "comment": "28 pages, 1 figure, 4 table", "categories": [ "cs.LG", "cs.AI", "cs.CL", "math.OC", "stat.ML" ], "abstract": "Aligning large language models (LLM) with human preference plays a key role in building modern generative models and can be achieved by reinforcement learning from human feedback (RLHF). Despite their superior performance, current RLHF approaches often require a large amount of human-labelled preference data, which is expensive to collect. In this paper, inspired by the success of active learning, we address this problem by proposing query-efficient RLHF methods. We first formalize the alignment problem as a contextual dueling bandit problem and design an active-query-based proximal policy optimization (APPO) algorithm with an $\\tilde{O}(d^2/\\Delta)$ instance-dependent regret bound and an $\\tilde{O}(d^2/\\Delta^2)$ query complexity, where $d$ is the dimension of feature space and $\\Delta$ is the sub-optimality gap over all the contexts. We then propose ADPO, a practical version of our algorithm based on direct preference optimization (DPO) and apply it to fine-tuning LLMs. Our experiments show that ADPO, while only making about half of queries for human preference, matches the performance of the state-of-the-art DPO method.", "revisions": [ { "version": "v2", "updated": "2025-02-11T18:18:59.000Z" } ], "analyses": { "keywords": [ "human feedback", "reinforcement learning", "active queries", "instance-dependent regret bound", "state-of-the-art dpo method" ], "note": { "typesetting": "TeX", "pages": 28, "language": "en", "license": "arXiv", "status": "editable" } } }