{
  "id": "2312.04027",
  "version": "v1",
  "published": "2023-12-07T03:53:17.000Z",
  "updated": "2023-12-07T03:53:17.000Z",
  "title": "The sample complexity of multi-distribution learning",
  "authors": [
    "Binghui Peng"
  ],
  "categories": [
    "cs.LG",
    "cs.AI",
    "cs.DS",
    "stat.ML"
  ],
  "abstract": "Multi-distribution learning generalizes the classic PAC learning to handle data coming from multiple distributions. Given a set of $k$ data distributions and a hypothesis class of VC dimension $d$, the goal is to learn a hypothesis that minimizes the maximum population loss over $k$ distributions, up to $\\epsilon$ additive error. In this paper, we settle the sample complexity of multi-distribution learning by giving an algorithm of sample complexity $\\widetilde{O}((d+k)\\epsilon^{-2}) \\cdot (k/\\epsilon)^{o(1)}$. This matches the lower bound up to sub-polynomial factor and resolves the COLT 2023 open problem of Awasthi, Haghtalab and Zhao [AHZ23].",
  "revisions": [
    {
      "version": "v1",
      "updated": "2023-12-07T03:53:17.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "sample complexity",
      "maximum population loss",
      "hypothesis class",
      "multiple distributions",
      "data distributions"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}