{ "id": "2003.09461", "version": "v1", "published": "2020-03-20T18:57:52.000Z", "updated": "2020-03-20T18:57:52.000Z", "title": "Adversarial Robustness on In- and Out-Distribution Improves Explainability", "authors": [ "Maximilian Augustin", "Alexander Meinke", "Matthias Hein" ], "categories": [ "cs.LG", "cs.CV", "stat.ML" ], "abstract": "Neural networks have led to major improvements in image classification but suffer from being non-robust to adversarial changes, unreliable uncertainty estimates on out-distribution samples and their inscrutable black-box decisions. In this work we propose RATIO, a training procedure for Robustness via Adversarial Training on In- and Out-distribution, which leads to robust models with reliable and robust confidence estimates on the out-distribution. RATIO has similar generative properties to adversarial training so that visual counterfactuals produce class specific features. While adversarial training comes at the price of lower clean accuracy, RATIO achieves state-of-the-art $l_2$-adversarial robustness on CIFAR10 and maintains better clean accuracy.", "revisions": [ { "version": "v1", "updated": "2020-03-20T18:57:52.000Z" } ], "analyses": { "keywords": [ "adversarial robustness", "out-distribution", "counterfactuals produce class specific features", "visual counterfactuals produce class specific", "maintains better clean accuracy" ], "note": { "typesetting": "TeX", "pages": 0, "language": "en", "license": "arXiv", "status": "editable" } } }