{
  "id": "1705.07592",
  "version": "v1",
  "published": "2017-05-22T07:47:24.000Z",
  "updated": "2017-05-22T07:47:24.000Z",
  "title": "Improved Clustering with Augmented k-means",
  "authors": [
    "J. Andrew Howe"
  ],
  "categories": [
    "stat.ML"
  ],
  "abstract": "Identifying a set of homogeneous clusters in a heterogeneous dataset is one of the most important classes of problems in statistical modeling. In the realm of unsupervised partitional clustering, k-means is a very important algorithm for this. In this technical report, we develop a new k-means variant called Augmented k-means, which is a hybrid of k-means and logistic regression. During each iteration, logistic regression is used to predict the current cluster labels, and the cluster belonging probabilities are used to control the subsequent re-estimation of cluster means. Observations which can't be firmly identified into clusters are excluded from the re-estimation step. This can be valuable when the data exhibit many characteristics of real datasets such as heterogeneity, non-sphericity, substantial overlap, and high scatter. Augmented k-means frequently outperforms k-means by more accurately classifying observations into known clusters and / or converging in fewer iterations. We demonstrate this on both simulated and real datasets. Our algorithm is implemented in Python and will be available with this report.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2017-05-22T07:47:24.000Z"
    }
  ],
  "analyses": {
    "subjects": [
      "62H30",
      "I.5.3",
      "G.3",
      "G.4"
    ],
    "keywords": [
      "logistic regression",
      "real datasets",
      "augmented k-means frequently outperforms k-means",
      "current cluster labels",
      "important classes"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}