{ "id": "2406.20081", "version": "v1", "published": "2024-06-28T17:47:32.000Z", "updated": "2024-06-28T17:47:32.000Z", "title": "Segment Anything without Supervision", "authors": [ "XuDong Wang", "Jingfeng Yang", "Trevor Darrell" ], "comment": "Code: https://github.com/frank-xwang/UnSAM", "categories": [ "cs.CV", "cs.LG" ], "abstract": "The Segmentation Anything Model (SAM) requires labor-intensive data labeling. We present Unsupervised SAM (UnSAM) for promptable and automatic whole-image segmentation that does not require human annotations. UnSAM utilizes a divide-and-conquer strategy to \"discover\" the hierarchical structure of visual scenes. We first leverage top-down clustering methods to partition an unlabeled image into instance/semantic level segments. For all pixels within a segment, a bottom-up clustering method is employed to iteratively merge them into larger groups, thereby forming a hierarchical structure. These unsupervised multi-granular masks are then utilized to supervise model training. Evaluated across seven popular datasets, UnSAM achieves competitive results with the supervised counterpart SAM, and surpasses the previous state-of-the-art in unsupervised segmentation by 11% in terms of AR. Moreover, we show that supervised SAM can also benefit from our self-supervised labels. By integrating our unsupervised pseudo masks into SA-1B's ground-truth masks and training UnSAM with only 1% of SA-1B, a lightly semi-supervised UnSAM can often segment entities overlooked by supervised SAM, exceeding SAM's AR by over 6.7% and AP by 3.9% on SA-1B.", "revisions": [ { "version": "v1", "updated": "2024-06-28T17:47:32.000Z" } ], "analyses": { "keywords": [ "first leverage top-down clustering methods", "supervision", "automatic whole-image segmentation", "hierarchical structure", "seven popular datasets" ], "tags": [ "github project" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }