{ "id": "2006.11144", "version": "v1", "published": "2020-06-19T14:11:26.000Z", "updated": "2020-06-19T14:11:26.000Z", "title": "On the Almost Sure Convergence of Stochastic Gradient Descent in Non-Convex Problems", "authors": [ "Panayotis Mertikopoulos", "Nadav Hallak", "Ali Kavis", "Volkan Cevher" ], "comment": "32 pages, 8 figures", "categories": [ "math.OC", "cs.LG", "math.PR", "stat.ML" ], "abstract": "This paper analyzes the trajectories of stochastic gradient descent (SGD) to help understand the algorithm's convergence properties in non-convex problems. We first show that the sequence of iterates generated by SGD remains bounded and converges with probability $1$ under a very broad range of step-size schedules. Subsequently, going beyond existing positive probability guarantees, we show that SGD avoids strict saddle points/manifolds with probability $1$ for the entire spectrum of step-size policies considered. Finally, we prove that the algorithm's rate of convergence to Hurwicz minimizers is $\\mathcal{O}(1/n^{p})$ if the method is employed with a $\\Theta(1/n^p)$ step-size schedule. This provides an important guideline for tuning the algorithm's step-size as it suggests that a cool-down phase with a vanishing step-size could lead to faster convergence; we demonstrate this heuristic using ResNet architectures on CIFAR.", "revisions": [ { "version": "v1", "updated": "2020-06-19T14:11:26.000Z" } ], "analyses": { "subjects": [ "90C26", "62L20", "90C30", "90C15", "37N40" ], "keywords": [ "stochastic gradient descent", "non-convex problems", "sure convergence", "sgd avoids strict saddle points/manifolds", "algorithms convergence properties" ], "note": { "typesetting": "TeX", "pages": 32, "language": "en", "license": "arXiv", "status": "editable" } } }