{ "id": "2502.02732", "version": "v1", "published": "2025-02-04T21:29:47.000Z", "updated": "2025-02-04T21:29:47.000Z", "title": "Peri-LN: Revisiting Layer Normalization in the Transformer Architecture", "authors": [ "Jeonghoon Kim", "Byeongchan Lee", "Cheonbok Park", "Yeontaek Oh", "Beomjun Kim", "Taehwan Yoo", "Seongjin Shin", "Dongyoon Han", "Jinwoo Shin", "Kang Min Yoo" ], "comment": "Preprint", "categories": [ "cs.LG", "cs.AI", "cs.CL" ], "abstract": "Designing Transformer architectures with the optimal layer normalization (LN) strategy that ensures large-scale training stability and expedite convergence has remained elusive, even in this era of large language models (LLMs). To this end, we present a comprehensive analytical foundation for understanding how different LN strategies influence training dynamics in large-scale Transformer training. Until recently, Pre-LN and Post-LN have long dominated standard practices despite their limitations in large-scale training. However, several open-source large-scale models have recently begun silently adopting a third strategy without much explanation. This strategy places layer normalization (LN) peripherally around sublayers, a design we term Peri-LN. While Peri-LN has demonstrated promising empirical performance, its precise mechanisms and benefits remain almost unexplored. Our in-depth analysis shows that Peri-LN strikes an ideal balance in variance growth -- unlike Pre-LN and Post-LN, which are prone to vanishing gradients and ``massive activations.'' To validate our theoretical insight, we conduct large-scale experiments on Transformers up to 3.2B parameters, showing that Peri-LN consistently achieves more balanced variance growth, steadier gradient flow, and convergence stability. Our results suggest that Peri-LN warrants broader consideration for large-scale Transformer architectures, providing renewed insights into the optimal placement and application of LN.", "revisions": [ { "version": "v1", "updated": "2025-02-04T21:29:47.000Z" } ], "analyses": { "keywords": [ "transformer architecture", "revisiting layer normalization", "long dominated standard practices despite", "peri-ln warrants broader consideration", "large-scale transformer" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }