{ "id": "2006.11144", "version": "v1", "published": "2020-06-19T14:11:26.000Z", "updated": "2020-06-19T14:11:26.000Z", "title": "On the Almost Sure Convergence of Stochastic Gradient Descent in Non-Convex Problems", "authors": [ "Panayotis Mertikopoulos", "Nadav Hallak", "Ali Kavis", "Volkan Cevher" ], "comment": "32 pages, 8 figures", "categories": [ "math.OC", "cs.LG", "math.PR", "stat.ML" ], "abstract": "This paper analyzes the trajectories of stochastic gradient descent (SGD) to help understand the algorithm's convergence properties in non-convex problems. We first show that the sequence of iterates generated by SGD remains bounded and converges with probability $1$ under a very broad range of step-size schedules. Subsequently, going beyond existing positive probability guarantees, we show that SGD avoids strict saddle points/manifolds with probability $1$ for the entire spectrum of step-size policies considered. Finally, we prove that the algorithm's rate of convergence to Hurwicz minimizers is $\\mathcal{O}(1/n^{p})$ if the method is employed with a $\\Theta(1/n^p)$ step-size schedule. This provides an important guideline for tuning the algorithm's step-size as it suggests that a cool-down phase with a vanishing step-size could lead to faster convergence; we demonstrate this heuristic using ResNet architectures on CIFAR.", "revisions": [ { "version": "v1", "updated": "2020-06-19T14:11:26.000Z" } ], "analyses": { "subjects": [ "90C26", "62L20", "90C30", "90C15", "37N40" ], "keywords": [ "stochastic gradient descent", "non-convex problems", "sure convergence", "sgd avoids strict saddle points/manifolds", "algorithms convergence properties" ], "note": { "typesetting": "TeX", "pages": 32, "language": "en", "license": "arXiv", "status": "editable" } } }