{
  "id": "2204.04083",
  "version": "v1",
  "published": "2022-04-08T14:01:41.000Z",
  "updated": "2022-04-08T14:01:41.000Z",
  "title": "POSTER: A Pyramid Cross-Fusion Transformer Network for Facial Expression Recognition",
  "authors": [
    "Ce Zheng",
    "Matias Mendieta",
    "Chen Chen"
  ],
  "categories": [
    "cs.CV",
    "cs.AI"
  ],
  "abstract": "Facial Expression Recognition (FER) has received increasing interest in the computer vision community. As a challenging task, there are three key issues especially prevalent in FER: inter-class similarity, intra-class discrepancy, and scale sensitivity. Existing methods typically address some of these issues, but do not tackle them all in a unified framework. Therefore, in this paper, we propose a two-stream Pyramid crOss-fuSion TransformER network (POSTER) that aims to holistically solve these issues. Specifically, we design a transformer-based cross-fusion paradigm that enables effective collaboration of facial landmark and direct image features to maximize proper attention to salient facial regions. Furthermore, POSTER employs a pyramid structure to promote scale invariance. Extensive experimental results demonstrate that our POSTER outperforms SOTA methods on RAF-DB with 92.05%, FERPlus with 91.62%, AffectNet (7 cls) with 67.31%, and AffectNet (8 cls) with 63.34%, respectively.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2022-04-08T14:01:41.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "facial expression recognition",
      "two-stream pyramid cross-fusion transformer network",
      "poster outperforms sota methods",
      "experimental results demonstrate",
      "promote scale invariance"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}