{ "id": "2208.13933", "version": "v1", "published": "2022-08-30T00:08:37.000Z", "updated": "2022-08-30T00:08:37.000Z", "title": "Using Taylor-Approximated Gradients to Improve the Frank-Wolfe Method for Empirical Risk Minimization", "authors": [ "Zikai Xiong", "Robert M. Freund" ], "comment": "27 pages, 2 figures", "categories": [ "cs.LG", "math.OC", "stat.ML" ], "abstract": "The Frank-Wolfe method has become increasingly useful in statistical and machine learning applications due to the structure-inducing properties of its iterates, especially in settings where linear minimization over the feasible set is more computationally efficient than projection. In the setting of Empirical Risk Minimization -- one of the fundamental optimization problems in statistical and machine learning -- the computational cost of Frank-Wolfe methods typically grows linearly in the number of data observations $n$. This is in stark contrast to the case for typical stochastic projection methods. In order to reduce this dependence on $n$, we look to the second-order smoothness of typical smooth loss functions (the least squares loss and logistic loss, for example), and we propose amending the Frank-Wolfe method with Taylor series-approximated gradients, including variants for both deterministic and stochastic settings. Compared with current state-of-the-art methods in the regime where the optimality tolerance $\\varepsilon$ is sufficiently small, our methods simultaneously reduce the dependence on large $n$ while attaining the optimal convergence rates of Frank-Wolfe methods, in both the convex and non-convex settings. We also propose a novel adaptive step-size approach for which we have computational guarantees. Finally, we present computational experiments showing that our methods exhibit significant speed-ups over existing methods on real-world datasets, for both convex and non-convex binary classification problems.", "revisions": [ { "version": "v1", "updated": "2022-08-30T00:08:37.000Z" } ], "analyses": { "keywords": [ "empirical risk minimization", "taylor-approximated gradients", "non-convex binary classification problems", "computational", "frank-wolfe methods" ], "note": { "typesetting": "TeX", "pages": 27, "language": "en", "license": "arXiv", "status": "editable" } } }