{
  "id": "2112.10992",
  "version": "v2",
  "published": "2021-12-21T05:31:51.000Z",
  "updated": "2022-04-24T09:18:20.000Z",
  "title": "Expansion-Squeeze-Excitation Fusion Network for Elderly Activity Recognition",
  "authors": [
    "Xiangbo Shu",
    "Jiawen Yang",
    "Rui Yan",
    "Yan Song"
  ],
  "categories": [
    "cs.CV",
    "stat.ML"
  ],
  "abstract": "This work focuses on the task of elderly activity recognition, which is a challenging task due to the existence of individual actions and human-object interactions in elderly activities. Thus, we attempt to effectively aggregate the discriminative information of actions and interactions from both RGB videos and skeleton sequences by attentively fusing multi-modal features. Recently, some nonlinear multi-modal fusion approaches are proposed by utilizing nonlinear attention mechanism that is extended from Squeeze-and-Excitation Networks (SENet). Inspired by this, we propose a novel Expansion-Squeeze-Excitation Fusion Network (ESE-FN) to effectively address the problem of elderly activity recognition, which learns modal and channel-wise Expansion-Squeeze-Excitation (ESE) attentions for attentively fusing the multi-modal features in the modal and channel-wise ways. Furthermore, we design a new Multi-modal Loss (ML) to keep the consistency between the single-modal features and the fused multi-modal features by adding the penalty of difference between the minimum prediction losses on single modalities and the prediction loss on the fused modality. Finally, we conduct experiments on a largest-scale elderly activity dataset, i.e., ETRI-Activity3D (including 110,000+ videos, and 50+ categories), to demonstrate that the proposed ESE-FN achieves the best accuracy compared with the state-of-the-art methods. In addition, more extensive experimental results show that the proposed ESE-FN is also comparable to the other methods in terms of normal action recognition task.",
  "revisions": [
    {
      "version": "v2",
      "updated": "2022-04-24T09:18:20.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "elderly activity recognition",
      "multi-modal features",
      "novel expansion-squeeze-excitation fusion network",
      "nonlinear multi-modal fusion approaches",
      "normal action recognition task"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}