{ "id": "2211.06883", "version": "v1", "published": "2022-11-13T12:03:40.000Z", "updated": "2022-11-13T12:03:40.000Z", "title": "Generalizing distribution of partial rewards for multi-armed bandits with temporally-partitioned rewards", "authors": [ "Ronald C. van den Broek", "Rik Litjens", "Tobias Sagis", "Luc Siecker", "Nina Verbeeke", "Pratik Gajane" ], "categories": [ "cs.LG", "stat.ML" ], "abstract": "We investigate the Multi-Armed Bandit problem with Temporally-Partitioned Rewards (TP-MAB) setting in this paper. In the TP-MAB setting, an agent will receive subsets of the reward over multiple rounds rather than the entire reward for the arm all at once. In this paper, we introduce a general formulation of how an arm's cumulative reward is distributed across several rounds, called Beta-spread property. Such a generalization is needed to be able to handle partitioned rewards in which the maximum reward per round is not distributed uniformly across rounds. We derive a lower bound on the TP-MAB problem under the assumption that Beta-spread holds. Moreover, we provide an algorithm TP-UCB-FR-G, which uses the Beta-spread property to improve the regret upper bound in some scenarios. By generalizing how the cumulative reward is distributed, this setting is applicable in a broader range of applications.", "revisions": [ { "version": "v1", "updated": "2022-11-13T12:03:40.000Z" } ], "analyses": { "keywords": [ "multi-armed bandit", "temporally-partitioned rewards", "generalizing distribution", "partial rewards", "beta-spread property" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }