{ "id": "2405.12781", "version": "v1", "published": "2024-05-21T13:28:32.000Z", "updated": "2024-05-21T13:28:32.000Z", "title": "Self-Supervised Modality-Agnostic Pre-Training of Swin Transformers", "authors": [ "Abhiroop Talasila", "Maitreya Maity", "U. Deva Priyakumar" ], "categories": [ "cs.CV", "cs.LG" ], "abstract": "Unsupervised pre-training has emerged as a transformative paradigm, displaying remarkable advancements in various domains. However, the susceptibility to domain shift, where pre-training data distribution differs from fine-tuning, poses a significant obstacle. To address this, we augment the Swin Transformer to learn from different medical imaging modalities, enhancing downstream performance. Our model, dubbed SwinFUSE (Swin Multi-Modal Fusion for UnSupervised Enhancement), offers three key advantages: (i) it learns from both Computed Tomography (CT) and Magnetic Resonance Images (MRI) during pre-training, resulting in complementary feature representations; (ii) a domain-invariance module (DIM) that effectively highlights salient input regions, enhancing adaptability; (iii) exhibits remarkable generalizability, surpassing the confines of tasks it was initially pre-trained on. Our experiments on two publicly available 3D segmentation datasets show a modest 1-2% performance trade-off compared to single-modality models, yet significant out-performance of up to 27% on out-of-distribution modality. This substantial improvement underscores our proposed approach's practical relevance and real-world applicability. Code is available at: https://github.com/devalab/SwinFUSE", "revisions": [ { "version": "v1", "updated": "2024-05-21T13:28:32.000Z" } ], "analyses": { "keywords": [ "swin transformer", "self-supervised modality-agnostic pre-training", "effectively highlights salient input regions", "3d segmentation datasets", "substantial improvement underscores" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }