{ "id": "1512.01708", "version": "v1", "published": "2015-12-05T22:48:40.000Z", "updated": "2015-12-05T22:48:40.000Z", "title": "Variance Reduction for Distributed Stochastic Gradient Descent", "authors": [ "Soham De", "Gavin Taylor", "Tom Goldstein" ], "comment": "Preprint", "categories": [ "cs.LG", "cs.DC", "math.OC", "stat.ML" ], "abstract": "Variance reduction (VR) methods boost the performance of stochastic gradient descent (SGD) by enabling the use of larger stepsizes and preserving linear convergence rates. However, current variance reduced SGD methods require either high memory usage or require a full pass over the (large) data set at the end of each epoch to calculate the exact gradient of the objective function. This makes current VR methods impractical in distributed or parallel settings. In this paper, we propose a variance reduction method, called VR-lite, that does not require full gradient computations or extra storage. We explore distributed synchronous and asynchronous variants with both high and low communication latency. We find that our distributed algorithms scale linearly with the number of local workers and remain stable even with low communication frequency. We empirically compare both the sequential and distributed algorithms to state-of-the-art stochastic optimization methods, and find that our proposed algorithms consistently converge faster than other stochastic methods.", "revisions": [ { "version": "v1", "updated": "2015-12-05T22:48:40.000Z" } ], "analyses": { "keywords": [ "distributed stochastic gradient descent", "variance reduction", "state-of-the-art stochastic optimization methods", "current variance reduced sgd methods", "low communication" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }