{ "id": "2212.00193", "version": "v1", "published": "2022-12-01T00:39:56.000Z", "updated": "2022-12-01T00:39:56.000Z", "title": "Distilling Multi-Step Reasoning Capabilities of Large Language Models into Smaller Models via Semantic Decompositions", "authors": [ "Kumar Shridhar", "Alessandro Stolfo", "Mrinmaya Sachan" ], "categories": [ "cs.LG", "cs.CL" ], "abstract": "Step-by-step reasoning approaches like chain-of-thought (CoT) have proved to be a very effective technique to induce reasoning capabilities in large language models. However, the success of the CoT approach depends primarily on model size, and often billion parameter-scale models are needed to get CoT to work. In this paper, we propose a knowledge distillation approach, that leverages the step-by-step CoT reasoning capabilities of larger models and distils these reasoning abilities into smaller models. Our approach Decompositional Distillation learns a semantic decomposition of the original problem into a sequence of subproblems and uses it to train two models: a) a problem decomposer that learns to decompose the complex reasoning problem into a sequence of simpler sub-problems and b) a problem solver that uses the intermediate subproblems to solve the overall problem. On a multi-step math word problem dataset (GSM8K), we boost the performance of GPT-2 variants up to 35% when distilled with our approach compared to CoT. We show that using our approach, it is possible to train a GPT-2-large model (775M) that can outperform a 10X larger GPT-3 (6B) model trained using CoT reasoning. Finally, we also demonstrate that our approach of problem decomposition can also be used as an alternative to CoT prompting, which boosts the GPT-3 performance by 40% compared to CoT prompts.", "revisions": [ { "version": "v1", "updated": "2022-12-01T00:39:56.000Z" } ], "analyses": { "keywords": [ "large language models", "distilling multi-step reasoning capabilities", "semantic decomposition", "smaller models", "math word problem dataset" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }