{ "id": "2309.17347", "version": "v1", "published": "2023-09-27T11:47:05.000Z", "updated": "2023-09-27T11:47:05.000Z", "title": "Demographic Parity: Mitigating Biases in Real-World Data", "authors": [ "Orestis Loukas", "Ho-Ryun Chung" ], "comment": "24 pages, 16 Figures, Python code attached", "categories": [ "cs.LG", "cs.CY" ], "abstract": "Computer-based decision systems are widely used to automate decisions in many aspects of everyday life, which include sensitive areas like hiring, loaning and even criminal sentencing. A decision pipeline heavily relies on large volumes of historical real-world data for training its models. However, historical training data often contains gender, racial or other biases which are propagated to the trained models influencing computer-based decisions. In this work, we propose a robust methodology that guarantees the removal of unwanted biases while maximally preserving classification utility. Our approach can always achieve this in a model-independent way by deriving from real-world data the asymptotic dataset that uniquely encodes demographic parity and realism. As a proof-of-principle, we deduce from public census records such an asymptotic dataset from which synthetic samples can be generated to train well-established classifiers. Benchmarking the generalization capability of these classifiers trained on our synthetic data, we confirm the absence of any explicit or implicit bias in the computer-aided decision.", "revisions": [ { "version": "v1", "updated": "2023-09-27T11:47:05.000Z" } ], "analyses": { "keywords": [ "real-world data", "mitigating biases", "asymptotic dataset", "decision pipeline heavily relies", "uniquely encodes demographic parity" ], "note": { "typesetting": "TeX", "pages": 24, "language": "en", "license": "arXiv", "status": "editable" } } }