{ "id": "2102.04939", "version": "v1", "published": "2021-02-09T16:49:58.000Z", "updated": "2021-02-09T16:49:58.000Z", "title": "RL for Latent MDPs: Regret Guarantees and a Lower Bound", "authors": [ "Jeongyeol Kwon", "Yonathan Efroni", "Constantine Caramanis", "Shie Mannor" ], "categories": [ "cs.LG" ], "abstract": "In this work, we consider the regret minimization problem for reinforcement learning in latent Markov Decision Processes (LMDP). In an LMDP, an MDP is randomly drawn from a set of $M$ possible MDPs at the beginning of the interaction, but the identity of the chosen MDP is not revealed to the agent. We first show that a general instance of LMDPs requires at least $\\Omega((SA)^M)$ episodes to even approximate the optimal policy. Then, we consider sufficient assumptions under which learning good policies requires polynomial number of episodes. We show that the key link is a notion of separation between the MDP system dynamics. With sufficient separation, we provide an efficient algorithm with local guarantee, {\\it i.e.,} providing a sublinear regret guarantee when we are given a good initialization. Finally, if we are given standard statistical sufficiency assumptions common in the Predictive State Representation (PSR) literature (e.g., Boots et al.) and a reachability assumption, we show that the need for initialization can be removed.", "revisions": [ { "version": "v1", "updated": "2021-02-09T16:49:58.000Z" } ], "analyses": { "keywords": [ "lower bound", "latent mdps", "standard statistical sufficiency assumptions common", "latent markov decision processes", "regret minimization problem" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }