{ "id": "2308.12006", "version": "v1", "published": "2023-08-23T08:49:43.000Z", "updated": "2023-08-23T08:49:43.000Z", "title": "Multi-stage Factorized Spatio-Temporal Representation for RGB-D Action and Gesture Recognition", "authors": [ "Yujun Ma", "Benjia Zhou", "Ruili Wang", "Pichao Wang" ], "comment": "ACM MM'23 has accepted this paper", "categories": [ "cs.CV" ], "abstract": "RGB-D action and gesture recognition remain an interesting topic in human-centered scene understanding, primarily due to the multiple granularities and large variation in human motion. Although many RGB-D based action and gesture recognition approaches have demonstrated remarkable results by utilizing highly integrated spatio-temporal representations across multiple modalities (i.e., RGB and depth data), they still encounter several challenges. Firstly, vanilla 3D convolution makes it hard to capture fine-grained motion differences between local clips under different modalities. Secondly, the intricate nature of highly integrated spatio-temporal modeling can lead to optimization difficulties. Thirdly, duplicate and unnecessary information can add complexity and complicate entangled spatio-temporal modeling. To address the above issues, we propose an innovative heuristic architecture called Multi-stage Factorized Spatio-Temporal (MFST) for RGB-D action and gesture recognition. The proposed MFST model comprises a 3D Central Difference Convolution Stem (CDC-Stem) module and multiple factorized spatio-temporal stages. The CDC-Stem enriches fine-grained temporal perception, and the multiple hierarchical spatio-temporal stages construct dimension-independent higher-order semantic primitives. Specifically, the CDC-Stem module captures bottom-level spatio-temporal features and passes them successively to the following spatio-temporal factored stages to capture the hierarchical spatial and temporal features through the Multi- Scale Convolution and Transformer (MSC-Trans) hybrid block and Weight-shared Multi-Scale Transformer (WMS-Trans) block. The seamless integration of these innovative designs results in a robust spatio-temporal representation that outperforms state-of-the-art approaches on RGB-D action and gesture recognition datasets.", "revisions": [ { "version": "v1", "updated": "2023-08-23T08:49:43.000Z" } ], "analyses": { "keywords": [ "gesture recognition", "multi-stage factorized spatio-temporal representation", "rgb-d action", "spatio-temporal stages construct", "captures bottom-level spatio-temporal features" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }