{ "id": "2210.07082", "version": "v1", "published": "2022-10-13T15:09:54.000Z", "updated": "2022-10-13T15:09:54.000Z", "title": "Implicit Bias in Leaky ReLU Networks Trained on High-Dimensional Data", "authors": [ "Spencer Frei", "Gal Vardi", "Peter L. Bartlett", "Nathan Srebro", "Wei Hu" ], "comment": "54 pages", "categories": [ "cs.LG", "stat.ML" ], "abstract": "The implicit biases of gradient-based optimization algorithms are conjectured to be a major factor in the success of modern deep learning. In this work, we investigate the implicit bias of gradient flow and gradient descent in two-layer fully-connected neural networks with leaky ReLU activations when the training data are nearly-orthogonal, a common property of high-dimensional data. For gradient flow, we leverage recent work on the implicit bias for homogeneous neural networks to show that asymptotically, gradient flow produces a neural network with rank at most two. Moreover, this network is an $\\ell_2$-max-margin solution (in parameter space), and has a linear decision boundary that corresponds to an approximate-max-margin linear predictor. For gradient descent, provided the random initialization variance is small enough, we show that a single step of gradient descent suffices to drastically reduce the rank of the network, and that the rank remains small throughout training. We provide experiments which suggest that a small initialization scale is important for finding low-rank neural networks with gradient descent.", "revisions": [ { "version": "v1", "updated": "2022-10-13T15:09:54.000Z" } ], "analyses": { "keywords": [ "implicit bias", "leaky relu networks", "high-dimensional data", "neural network", "gradient descent" ], "note": { "typesetting": "TeX", "pages": 54, "language": "en", "license": "arXiv", "status": "editable" } } }