{ "id": "2307.02071", "version": "v1", "published": "2023-07-05T07:26:27.000Z", "updated": "2023-07-05T07:26:27.000Z", "title": "A Comparison of Machine Learning Methods for Data with High-Cardinality Categorical Variables", "authors": [ "Fabio Sigrist" ], "categories": [ "cs.LG", "cs.AI", "stat.ML" ], "abstract": "High-cardinality categorical variables are variables for which the number of different levels is large relative to the sample size of a data set, or in other words, there are few data points per level. Machine learning methods can have difficulties with high-cardinality variables. In this article, we empirically compare several versions of two of the most successful machine learning methods, tree-boosting and deep neural networks, and linear mixed effects models using multiple tabular data sets with high-cardinality categorical variables. We find that, first, machine learning models with random effects have higher prediction accuracy than their classical counterparts without random effects, and, second, tree-boosting with random effects outperforms deep neural networks with random effects.", "revisions": [ { "version": "v1", "updated": "2023-07-05T07:26:27.000Z" } ], "analyses": { "keywords": [ "machine learning methods", "high-cardinality categorical variables", "random effects outperforms deep neural", "effects outperforms deep neural networks" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }