{ "id": "2001.08677", "version": "v1", "published": "2020-01-23T17:19:29.000Z", "updated": "2020-01-23T17:19:29.000Z", "title": "Towards Automatic Clustering Analysis using Traces of Information Gain: The InfoGuide Method", "authors": [ "Paulo Rocha", "Diego Pinheiro", "Martin Cadeiras", "Carmelo Bastos-Filho" ], "comment": "The 33rd International FLAIRS Conference", "categories": [ "cs.LG", "stat.ML" ], "abstract": "Clustering analysis has become a ubiquitous information retrieval tool in a wide range of domains, but a more automatic framework is still lacking. Though internal metrics are the key players towards a successful retrieval of clusters, their effectiveness on real-world datasets remains not fully understood, mainly because of their unrealistic assumptions about the underlying datasets. We hypothesized that capturing {\\it traces of information gain} between increasingly complex clustering retrievals---{\\it InfoGuide}---enables an automatic clustering analysis with improved clustering retrievals. We validated the {\\it InfoGuide} hypothesis by capturing the traces of information gain using the Kolmogorov-Smirnov statistic and comparing the clusters retrieved by {\\it InfoGuide} against those retrieved by other commonly used internal metrics in artificially-generated, benchmark, and real-world datasets. Our results suggested that {\\it InfoGuide} can enable a more automatic clustering analysis and may be more suitable for retrieving clusters in real-world datasets displaying nontrivial statistical properties.", "revisions": [ { "version": "v1", "updated": "2020-01-23T17:19:29.000Z" } ], "analyses": { "keywords": [ "automatic clustering analysis", "information gain", "infoguide method", "real-world datasets", "displaying nontrivial statistical properties" ], "tags": [ "conference paper" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }