{ "id": "2105.12540", "version": "v1", "published": "2021-05-26T13:35:42.000Z", "updated": "2021-05-26T13:35:42.000Z", "title": "Finite-Sample Analysis of Off-Policy Natural Actor-Critic with Linear Function Approximation", "authors": [ "Zaiwei Chen", "Sajad Khodadadian", "Siva Theja Maguluri" ], "categories": [ "cs.LG", "math.OC" ], "abstract": "In this paper, we develop a novel variant of the off-policy natural actor-critic algorithm with linear function approximation and we establish a sample complexity of $\\mathcal{O}(\\epsilon^{-3})$, outperforming all the previously known convergence bounds of such algorithms. In order to overcome the divergence due to the deadly triad in off-policy policy evaluation under function approximation, we develop a critic that employs an $n$-step TD-learning algorithm with a properly chosen $n$. We present finite-sample convergence bounds on this critic under both constant and diminishing step sizes, which are of independent interest. Furthermore, we develop a variant of natural policy gradient under function approximation, with an improved convergence rate of $\\mathcal{O}(1/T)$ after $T$ iterations. Combining the finite-sample error bounds of the actor and the critic, we obtain the $\\mathcal{O}(\\epsilon^{-3})$ sample complexity. We derive our sample complexity bounds solely based on the assumption that the behavior policy sufficiently explores all the states and actions, which is a much lighter assumption compared to the related literature.", "revisions": [ { "version": "v1", "updated": "2021-05-26T13:35:42.000Z" } ], "analyses": { "keywords": [ "linear function approximation", "finite-sample analysis", "off-policy natural actor-critic algorithm", "convergence bounds", "finite sample error bounds" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }