{
  "id": "2302.08783",
  "version": "v1",
  "published": "2023-02-17T09:46:08.000Z",
  "updated": "2023-02-17T09:46:08.000Z",
  "title": "SGD with AdaGrad Stepsizes: Full Adaptivity with High Probability to Unknown Parameters, Unbounded Gradients and Affine Variance",
  "authors": [
    "Amit Attia",
    "Tomer Koren"
  ],
  "comment": "25 pages",
  "categories": [
    "cs.LG",
    "math.OC",
    "stat.ML"
  ],
  "abstract": "We study Stochastic Gradient Descent with AdaGrad stepsizes: a popular adaptive (self-tuning) method for first-order stochastic optimization. Despite being well studied, existing analyses of this method suffer from various shortcomings: they either assume some knowledge of the problem parameters, impose strong global Lipschitz conditions, or fail to give bounds that hold with high probability. We provide a comprehensive analysis of this basic method without any of these limitations, in both the convex and non-convex (smooth) cases, that additionally supports a general ``affine variance'' noise model and provides sharp rates of convergence in both the low-noise and high-noise~regimes.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2023-02-17T09:46:08.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "affine variance",
      "high probability",
      "adagrad stepsizes",
      "unknown parameters",
      "full adaptivity"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 25,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}