{ "id": "2302.08783", "version": "v1", "published": "2023-02-17T09:46:08.000Z", "updated": "2023-02-17T09:46:08.000Z", "title": "SGD with AdaGrad Stepsizes: Full Adaptivity with High Probability to Unknown Parameters, Unbounded Gradients and Affine Variance", "authors": [ "Amit Attia", "Tomer Koren" ], "comment": "25 pages", "categories": [ "cs.LG", "math.OC", "stat.ML" ], "abstract": "We study Stochastic Gradient Descent with AdaGrad stepsizes: a popular adaptive (self-tuning) method for first-order stochastic optimization. Despite being well studied, existing analyses of this method suffer from various shortcomings: they either assume some knowledge of the problem parameters, impose strong global Lipschitz conditions, or fail to give bounds that hold with high probability. We provide a comprehensive analysis of this basic method without any of these limitations, in both the convex and non-convex (smooth) cases, that additionally supports a general ``affine variance'' noise model and provides sharp rates of convergence in both the low-noise and high-noise~regimes.", "revisions": [ { "version": "v1", "updated": "2023-02-17T09:46:08.000Z" } ], "analyses": { "keywords": [ "affine variance", "high probability", "adagrad stepsizes", "unknown parameters", "full adaptivity" ], "note": { "typesetting": "TeX", "pages": 25, "language": "en", "license": "arXiv", "status": "editable" } } }