{
  "id": "2405.10761",
  "version": "v1",
  "published": "2024-05-17T13:17:48.000Z",
  "updated": "2024-05-17T13:17:48.000Z",
  "title": "Critical feature learning in deep neural networks",
  "authors": [
    "Kirsten Fischer",
    "Javed Lindner",
    "David Dahmen",
    "Zohar Ringel",
    "Michael Krämer",
    "Moritz Helias"
  ],
  "comment": "31 pages, 7 figures, accepted at International Conference on Machine Learning 2024",
  "categories": [
    "cond-mat.dis-nn"
  ],
  "abstract": "A key property of neural networks driving their success is their ability to learn features from data. Understanding feature learning from a theoretical viewpoint is an emerging field with many open questions. In this work we capture finite-width effects with a systematic theory of network kernels in deep non-linear neural networks. We show that the Bayesian prior of the network can be written in closed form as a superposition of Gaussian processes, whose kernels are distributed with a variance that depends inversely on the network width N . A large deviation approach, which is exact in the proportional limit for the number of data points $P = \\alpha N \\rightarrow \\infty$, yields a pair of forward-backward equations for the maximum a posteriori kernels in all layers at once. We study their solutions perturbatively to demonstrate how the backward propagation across layers aligns kernels with the target. An alternative field-theoretic formulation shows that kernel adaptation of the Bayesian posterior at finite-width results from fluctuations in the prior: larger fluctuations correspond to a more flexible network prior and thus enable stronger adaptation to data. We thus find a bridge between the classical edge-of-chaos NNGP theory and feature learning, exposing an intricate interplay between criticality, response functions, and feature scale.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2024-05-17T13:17:48.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "deep neural networks",
      "critical feature learning",
      "deep non-linear neural networks",
      "capture finite-width effects",
      "large deviation approach"
    ],
    "tags": [
      "conference paper"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 31,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}