{
  "id": "1004.5229",
  "version": "v3",
  "published": "2010-04-29T09:31:55.000Z",
  "updated": "2010-10-13T10:11:39.000Z",
  "title": "Optimism in Reinforcement Learning and Kullback-Leibler Divergence",
  "authors": [
    "Sarah Filippi",
    "Olivier Cappé",
    "Aurélien Garivier"
  ],
  "comment": "This work has been accepted and presented at ALLERTON 2010; Communication, Control, and Computing (Allerton), 2010 48th Annual Allerton Conference on, Monticello (Illinois) : \\'Etats-Unis (2010)",
  "doi": "10.1109/ALLERTON.2010.5706896",
  "categories": [
    "cs.LG",
    "math.ST",
    "stat.ML",
    "stat.TH"
  ],
  "abstract": "We consider model-based reinforcement learning in finite Markov De- cision Processes (MDPs), focussing on so-called optimistic strategies. In MDPs, optimism can be implemented by carrying out extended value it- erations under a constraint of consistency with the estimated model tran- sition probabilities. The UCRL2 algorithm by Auer, Jaksch and Ortner (2009), which follows this strategy, has recently been shown to guarantee near-optimal regret bounds. In this paper, we strongly argue in favor of using the Kullback-Leibler (KL) divergence for this purpose. By studying the linear maximization problem under KL constraints, we provide an ef- ficient algorithm, termed KL-UCRL, for solving KL-optimistic extended value iteration. Using recent deviation bounds on the KL divergence, we prove that KL-UCRL provides the same guarantees as UCRL2 in terms of regret. However, numerical experiments on classical benchmarks show a significantly improved behavior, particularly when the MDP has reduced connectivity. To support this observation, we provide elements of com- parison between the two algorithms based on geometric considerations.",
  "revisions": [
    {
      "version": "v3",
      "updated": "2010-10-13T10:11:39.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "reinforcement learning",
      "kullback-leibler divergence",
      "guarantee near-optimal regret bounds",
      "linear maximization problem",
      "solving kl-optimistic extended value iteration"
    ],
    "tags": [
      "conference paper",
      "journal article"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable",
      "adsabs": "2010arXiv1004.5229F"
    }
  }
}