{
  "id": "1906.08255",
  "version": "v1",
  "published": "2019-06-19T22:09:47.000Z",
  "updated": "2019-06-19T22:09:47.000Z",
  "title": "Training on test data: Removing near duplicates in Fashion-MNIST",
  "authors": [
    "Christopher Geier"
  ],
  "categories": [
    "cs.LG",
    "cs.CV",
    "stat.ML"
  ],
  "abstract": "MNIST and Fashion MNIST are extremely popular for testing in the machine learning space. Fashion MNIST improves on MNIST by introducing a harder problem, increasing the diversity of testing sets, and more accurately representing a modern computer vision task. In order to increase the data quality of FashionMNIST, this paper investigates near duplicate images between training and testing sets. Near-duplicates between testing and training sets artificially increase the testing accuracy of machine learning models. This paper identifies near-duplicate images in Fashion MNIST and proposes a dataset with near-duplicates removed.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2019-06-19T22:09:47.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "test data",
      "fashion mnist",
      "paper identifies near-duplicate images",
      "fashion-mnist",
      "testing sets"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}