{
  "id": "1909.03681",
  "version": "v1",
  "published": "2019-09-09T07:43:47.000Z",
  "updated": "2019-09-09T07:43:47.000Z",
  "title": "Outlier Detection in High Dimensional Data",
  "authors": [
    "Firuz Kamalov",
    "Ho Hon Leung"
  ],
  "categories": [
    "cs.LG",
    "cs.AI",
    "stat.ML"
  ],
  "abstract": "High-dimensional data poses unique challenges in outlier detection process. Most of the existing algorithms fail to properly address the issues stemming from a large number of features. In particular, outlier detection algorithms perform poorly on data set of small size with a large number of features. In this paper, we propose a novel outlier detection algorithm based on principal component analysis and kernel density estimation. The proposed method is designed to address the challenges of dealing with high-dimensional data by projecting the original data onto a smaller space and using the innate structure of the data to calculate anomaly scores for each data point. Numerical experiments on synthetic and real-life data show that our method performs well on high-dimensional data. In particular, the proposed method outperforms the benchmark methods as measured by the $F_1$-score. Our method also produces better-than-average execution times compared to the benchmark methods.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2019-09-09T07:43:47.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "high dimensional data",
      "detection algorithms perform",
      "outlier detection algorithm",
      "high-dimensional data poses unique challenges",
      "produces better-than-average execution times"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 0,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}