{
  "id": "2303.14358",
  "version": "v1",
  "published": "2023-03-25T04:47:31.000Z",
  "updated": "2023-03-25T04:47:31.000Z",
  "title": "Multi-view knowledge distillation transformer for human action recognition",
  "authors": [
    "Ying-Chen Lin",
    "Vincent S. Tseng"
  ],
  "categories": [
    "cs.CV"
  ],
  "abstract": "Recently, Transformer-based methods have been utilized to improve the performance of human action recognition. However, most of these studies assume that multi-view data is complete, which may not always be the case in real-world scenarios. Therefore, this paper presents a novel Multi-view Knowledge Distillation Transformer (MKDT) framework that consists of a teacher network and a student network. This framework aims to handle incomplete human action problems in real-world applications. Specifically, the multi-view knowledge distillation transformer uses a hierarchical vision transformer with shifted windows to capture more spatial-temporal information. Experimental results demonstrate that our framework outperforms the CNN-based method on three public datasets.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2023-03-25T04:47:31.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "human action recognition",
      "handle incomplete human action problems",
      "novel multi-view knowledge distillation transformer"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}