{ "id": "2312.04027", "version": "v1", "published": "2023-12-07T03:53:17.000Z", "updated": "2023-12-07T03:53:17.000Z", "title": "The sample complexity of multi-distribution learning", "authors": [ "Binghui Peng" ], "categories": [ "cs.LG", "cs.AI", "cs.DS", "stat.ML" ], "abstract": "Multi-distribution learning generalizes the classic PAC learning to handle data coming from multiple distributions. Given a set of $k$ data distributions and a hypothesis class of VC dimension $d$, the goal is to learn a hypothesis that minimizes the maximum population loss over $k$ distributions, up to $\\epsilon$ additive error. In this paper, we settle the sample complexity of multi-distribution learning by giving an algorithm of sample complexity $\\widetilde{O}((d+k)\\epsilon^{-2}) \\cdot (k/\\epsilon)^{o(1)}$. This matches the lower bound up to sub-polynomial factor and resolves the COLT 2023 open problem of Awasthi, Haghtalab and Zhao [AHZ23].", "revisions": [ { "version": "v1", "updated": "2023-12-07T03:53:17.000Z" } ], "analyses": { "keywords": [ "sample complexity", "maximum population loss", "hypothesis class", "multiple distributions", "data distributions" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }