{
  "id": "2310.02572",
  "version": "v1",
  "published": "2023-10-04T04:18:01.000Z",
  "updated": "2023-10-04T04:18:01.000Z",
  "title": "Improving Knowledge Distillation with Teacher's Explanation",
  "authors": [
    "Sayantan Chowdhury",
    "Ben Liang",
    "Ali Tizghadam",
    "Ilijc Albanese"
  ],
  "categories": [
    "cs.LG"
  ],
  "abstract": "Knowledge distillation (KD) improves the performance of a low-complexity student model with the help of a more powerful teacher. The teacher in KD is a black-box model, imparting knowledge to the student only through its predictions. This limits the amount of transferred knowledge. In this work, we introduce a novel Knowledge Explaining Distillation (KED) framework, which allows the student to learn not only from the teacher's predictions but also from the teacher's explanations. We propose a class of superfeature-explaining teachers that provide explanation over groups of features, along with the corresponding student model. We also present a method for constructing the superfeatures. We then extend KED to reduce complexity in convolutional neural networks, to allow augmentation with hidden-representation distillation methods, and to work with a limited amount of training data using chimeric sets. Our experiments over a variety of datasets show that KED students can substantially outperform KD students of similar complexity.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2023-10-04T04:18:01.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "improving knowledge distillation",
      "teachers explanation",
      "substantially outperform kd students",
      "low-complexity student model",
      "hidden-representation distillation methods"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}