{ "id": "2112.10992", "version": "v2", "published": "2021-12-21T05:31:51.000Z", "updated": "2022-04-24T09:18:20.000Z", "title": "Expansion-Squeeze-Excitation Fusion Network for Elderly Activity Recognition", "authors": [ "Xiangbo Shu", "Jiawen Yang", "Rui Yan", "Yan Song" ], "categories": [ "cs.CV", "stat.ML" ], "abstract": "This work focuses on the task of elderly activity recognition, which is a challenging task due to the existence of individual actions and human-object interactions in elderly activities. Thus, we attempt to effectively aggregate the discriminative information of actions and interactions from both RGB videos and skeleton sequences by attentively fusing multi-modal features. Recently, some nonlinear multi-modal fusion approaches have been proposed that utilize a nonlinear attention mechanism extended from Squeeze-and-Excitation Networks (SENet). Inspired by this, we propose a novel Expansion-Squeeze-Excitation Fusion Network (ESE-FN) to effectively address the problem of elderly activity recognition, which learns modal and channel-wise Expansion-Squeeze-Excitation (ESE) attentions for attentively fusing the multi-modal features in the modal and channel-wise ways. Furthermore, we design a new Multi-modal Loss (ML) to keep the consistency between the single-modal features and the fused multi-modal features by adding a penalty on the difference between the minimum prediction losses on single modalities and the prediction loss on the fused modality. Finally, we conduct experiments on the largest-scale elderly activity dataset, i.e., ETRI-Activity3D (including 110,000+ videos and 50+ categories), to demonstrate that the proposed ESE-FN achieves the best accuracy compared with the state-of-the-art methods. In addition, more extensive experimental results show that the proposed ESE-FN is also comparable to the other methods on the normal action recognition task.", "revisions": [ { "version": "v2", "updated": "2022-04-24T09:18:20.000Z" } ], "analyses": { "keywords": [ "elderly activity recognition", "multi-modal features", "novel expansion-squeeze-excitation fusion network", "nonlinear multi-modal fusion approaches", "normal action recognition task" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }