{ "id": "2102.09318", "version": "v1", "published": "2021-02-18T13:22:59.000Z", "updated": "2021-02-18T13:22:59.000Z", "title": "Finite-Sample Analysis of Off-Policy Natural Actor-Critic Algorithm", "authors": [ "Sajad Khodadadian", "Zaiwei Chen", "Siva Theja Maguluri" ], "categories": [ "cs.LG", "math.OC", "stat.ML" ], "abstract": "In this paper, we provide finite-sample convergence guarantees for an off-policy variant of the natural actor-critic (NAC) algorithm based on Importance Sampling. In particular, we show that the algorithm converges to a global optimal policy with a sample complexity of $\\mathcal{O}(\\epsilon^{-3}\\log^2(1/\\epsilon))$ under an appropriate choice of stepsizes. In order to overcome the issue of large variance due to Importance Sampling, we propose the $Q$-trace algorithm for the critic, which is inspired by the V-trace algorithm (Espeholt et al., 2018). This enables us to explicitly control the bias and variance, and characterize the trade-off between them. As an advantage of off-policy sampling, a major feature of our result is that we do not need any additional assumptions, beyond the ergodicity of the Markov chain induced by the behavior policy.", "revisions": [ { "version": "v1", "updated": "2021-02-18T13:22:59.000Z" } ], "analyses": { "keywords": [ "off-policy natural actor-critic algorithm", "finite-sample analysis", "finite-sample convergence guarantees", "global optimal policy", "behavior policy" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }