{ "id": "2310.12806", "version": "v1", "published": "2023-10-19T15:01:57.000Z", "updated": "2023-10-19T15:01:57.000Z", "title": "DCSI -- An improved measure of cluster separability based on separation and connectedness", "authors": [ "Jana Gauss", "Fabian Scheipl", "Moritz Herrmann" ], "categories": [ "stat.ML", "cs.LG" ], "abstract": "Whether class labels in a given data set correspond to meaningful clusters is crucial for the evaluation of clustering algorithms using real-world data sets. This property can be quantified by separability measures. A review of the existing literature shows that neither classification-based complexity measures nor cluster validity indices (CVIs) adequately incorporate the central aspects of separability for density-based clustering: between-class separation and within-class connectedness. A newly developed measure (density cluster separability index, DCSI) aims to quantify these two characteristics and can also be used as a CVI. Extensive experiments on synthetic data indicate that DCSI correlates strongly with the performance of DBSCAN measured via the adjusted Rand index (ARI) but lacks robustness when it comes to multi-class data sets with overlapping classes that are ill-suited for density-based hard clustering. Detailed evaluation on frequently used real-world data sets shows that DCSI can correctly identify touching or overlapping classes that do not form meaningful clusters.", "revisions": [ { "version": "v1", "updated": "2023-10-19T15:01:57.000Z" } ], "analyses": { "keywords": [ "real-world data sets", "connectedness", "separation", "density cluster separability index", "meaningful clusters" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }