{ "id": "1905.13210", "version": "v1", "published": "2019-05-30T17:53:07.000Z", "updated": "2019-05-30T17:53:07.000Z", "title": "Generalization Bounds of Stochastic Gradient Descent for Wide and Deep Neural Networks", "authors": [ "Yuan Cao", "Quanquan Gu" ], "comment": "22 pages", "categories": [ "cs.LG", "math.OC", "stat.ML" ], "abstract": "We study the training and generalization of deep neural networks (DNNs) in the over-parameterized regime, where the network width (i.e., the number of hidden nodes per layer) is much larger than the number of training data points. We show that the expected $0$-$1$ loss of a sufficiently wide ReLU network trained with stochastic gradient descent (SGD) from random initialization can be bounded by the training loss of a random feature model induced by the network gradient at initialization, which we call a neural tangent random feature (NTRF) model. For data distributions that can be classified by an NTRF model with sufficiently small error, our result yields a generalization error bound on the order of $\\tilde{\\mathcal{O}}(n^{-1/2})$ that is independent of the network width. Our result is more general and sharper than many existing generalization error bounds for over-parameterized neural networks. In addition, we establish a strong connection between our generalization error bound and the neural tangent kernel (NTK) proposed in recent work.", "revisions": [ { "version": "v1", "updated": "2019-05-30T17:53:07.000Z" } ], "analyses": { "keywords": [ "stochastic gradient descent", "deep neural networks", "generalization error bound", "generalization bounds", "network width" ], "note": { "typesetting": "TeX", "pages": 22, "language": "en", "license": "arXiv", "status": "editable" } } }