{
  "id": "2402.14505",
  "version": "v3",
  "published": "2024-02-22T12:55:01.000Z",
  "updated": "2024-04-03T14:59:08.000Z",
  "title": "Towards Seamless Adaptation of Pre-trained Models for Visual Place Recognition",
  "authors": [
    "Feng Lu",
    "Lijun Zhang",
    "Xiangyuan Lan",
    "Shuting Dong",
    "Yaowei Wang",
    "Chun Yuan"
  ],
  "comment": "ICLR2024",
  "categories": [
    "cs.CV",
    "cs.AI"
  ],
  "abstract": "Recent studies show that vision models pre-trained in generic visual learning tasks with large-scale data can provide useful feature representations for a wide range of visual perception problems. However, few attempts have been made to exploit pre-trained foundation models in visual place recognition (VPR). Due to the inherent difference in training objectives and data between the tasks of model pre-training and VPR, how to bridge the gap and fully unleash the capability of pre-trained models for VPR is still a key issue to address. To this end, we propose a novel method to realize seamless adaptation of pre-trained models for VPR. Specifically, to obtain both global and local features that focus on salient landmarks for discriminating places, we design a hybrid adaptation method to achieve both global and local adaptation efficiently, in which only lightweight adapters are tuned without adjusting the pre-trained model. Besides, to guide effective adaptation, we propose a mutual nearest neighbor local feature loss, which ensures proper dense local features are produced for local matching and avoids time-consuming spatial verification in re-ranking. Experimental results show that our method outperforms the state-of-the-art methods with less training data and training time, and uses about only 3% retrieval runtime of the two-stage VPR methods with RANSAC-based spatial verification. It ranks 1st on the MSLS challenge leaderboard (at the time of submission). The code is released at https://github.com/Lu-Feng/SelaVPR.",
  "revisions": [
    {
      "version": "v3",
      "updated": "2024-04-03T14:59:08.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "visual place recognition",
      "pre-trained model",
      "seamless adaptation",
      "neighbor local feature loss",
      "nearest neighbor local feature"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}