{ "id": "2303.14358", "version": "v1", "published": "2023-03-25T04:47:31.000Z", "updated": "2023-03-25T04:47:31.000Z", "title": "Multi-view knowledge distillation transformer for human action recognition", "authors": [ "Ying-Chen Lin", "Vincent S. Tseng" ], "categories": [ "cs.CV" ], "abstract": "Recently, Transformer-based methods have been utilized to improve the performance of human action recognition. However, most of these studies assume that multi-view data is complete, which may not always be the case in real-world scenarios. Therefore, this paper presents a novel Multi-view Knowledge Distillation Transformer (MKDT) framework that consists of a teacher network and a student network. This framework aims to handle incomplete human action problems in real-world applications. Specifically, the multi-view knowledge distillation transformer uses a hierarchical vision transformer with shifted windows to capture more spatial-temporal information. Experimental results demonstrate that our framework outperforms the CNN-based method on three public datasets.", "revisions": [ { "version": "v1", "updated": "2023-03-25T04:47:31.000Z" } ], "analyses": { "keywords": [ "human action recognition", "handle incomplete human action problems", "novel multi-view knowledge distillation transformer" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }