{ "id": "1709.01716", "version": "v1", "published": "2017-09-06T08:32:50.000Z", "updated": "2017-09-06T08:32:50.000Z", "title": "Optimal Sub-sampling with Influence Functions", "authors": [ "Daniel Ting", "Eric Brochu" ], "categories": [ "stat.ML", "cs.LG" ], "abstract": "Sub-sampling is a common and often effective method to deal with the computational challenges of large datasets. However, for most statistical models, there is no well-motivated approach for drawing a non-uniform subsample. We show that the concept of an asymptotically linear estimator and the associated influence function leads to optimal sampling procedures for a wide class of popular models. Furthermore, for linear regression models which have well-studied procedures for non-uniform sub-sampling, we show our optimal influence function based method outperforms previous approaches. We empirically show the improved performance of our method on real datasets.", "revisions": [ { "version": "v1", "updated": "2017-09-06T08:32:50.000Z" } ], "analyses": { "keywords": [ "optimal sub-sampling", "optimal influence function", "linear regression models", "real datasets", "large datasets" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }