{ "id": "1906.08255", "version": "v1", "published": "2019-06-19T22:09:47.000Z", "updated": "2019-06-19T22:09:47.000Z", "title": "Training on test data: Removing near duplicates in Fashion-MNIST", "authors": [ "Christopher Geier" ], "categories": [ "cs.LG", "cs.CV", "stat.ML" ], "abstract": "MNIST and Fashion MNIST are extremely popular for testing in the machine learning space. Fashion MNIST improves on MNIST by introducing a harder problem, increasing the diversity of testing sets, and more accurately representing a modern computer vision task. In order to increase the data quality of FashionMNIST, this paper investigates near duplicate images between training and testing sets. Near-duplicates between testing and training sets artificially increase the testing accuracy of machine learning models. This paper identifies near-duplicate images in Fashion MNIST and proposes a dataset with near-duplicates removed.", "revisions": [ { "version": "v1", "updated": "2019-06-19T22:09:47.000Z" } ], "analyses": { "keywords": [ "test data", "fashion mnist", "paper identifies near-duplicate images", "fashion-mnist", "testing sets" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }