{ "id": "2402.01845", "version": "v2", "published": "2024-02-02T19:02:47.000Z", "updated": "2024-07-15T19:17:42.000Z", "title": "Multi-Armed Bandits with Interference", "authors": [ "Su Jia", "Peter Frazier", "Nathan Kallus" ], "categories": [ "cs.LG", "stat.ML" ], "abstract": "Experimentation with interference poses a significant challenge in contemporary online platforms. Prior research on experimentation with interference has concentrated on the final output of a policy. The cumulative performance, while equally crucial, is less well understood. To address this gap, we introduce the problem of {\\em Multi-armed Bandits with Interference} (MABI), where the learner assigns an arm to each of $N$ experimental units over a time horizon of $T$ rounds. The reward of each unit in each round depends on the treatments of {\\em all} units, where the influence of a unit decays in the spatial distance between units. Furthermore, we employ a general setup wherein the reward functions are chosen by an adversary and may vary arbitrarily across rounds and units. We first show that switchback policies achieve an optimal {\\em expected} regret $\\tilde O(\\sqrt T)$ against the best fixed-arm policy. Nonetheless, the regret (as a random variable) for any switchback policy suffers a high variance, as it does not account for $N$. We propose a cluster randomization policy whose regret (i) is optimal in {\\em expectation} and (ii) admits a high probability bound that vanishes in $N$.", "revisions": [ { "version": "v2", "updated": "2024-07-15T19:17:42.000Z" } ], "analyses": { "keywords": [ "multi-armed bandits", "contemporary online platforms", "best fixed-arm policy", "high probability bound", "switchback policy suffers" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }