{ "id": "2205.10217", "version": "v1", "published": "2022-05-20T14:50:24.000Z", "updated": "2022-05-20T14:50:24.000Z", "title": "Memorization and Optimization in Deep Neural Networks with Minimum Over-parameterization", "authors": [ "Simone Bombari", "Mohammad Hossein Amani", "Marco Mondelli" ], "categories": [ "stat.ML", "cs.IT", "cs.LG", "math.IT" ], "abstract": "The Neural Tangent Kernel (NTK) has emerged as a powerful tool to provide memorization, optimization and generalization guarantees in deep neural networks. A line of work has studied the NTK spectrum for two-layer and deep networks with at least a layer with $\\Omega(N)$ neurons, $N$ being the number of training samples. Furthermore, there is increasing evidence suggesting that deep networks with sub-linear layer widths are powerful memorizers and optimizers, as long as the number of parameters exceeds the number of samples. Thus, a natural open question is whether the NTK is well conditioned in such a challenging sub-linear setup. In this paper, we answer this question in the affirmative. Our key technical contribution is a lower bound on the smallest NTK eigenvalue for deep networks with the minimum possible over-parameterization: the number of parameters is roughly $\\Omega(N)$ and, hence, the number of neurons is as little as $\\Omega(\\sqrt{N})$. To showcase the applicability of our NTK bounds, we provide two results concerning memorization capacity and optimization guarantees for gradient descent training.", "revisions": [ { "version": "v1", "updated": "2022-05-20T14:50:24.000Z" } ], "analyses": { "keywords": [ "deep neural networks", "minimum over-parameterization", "deep networks", "optimization", "sub-linear layer widths" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }