{ "id": "2104.11315", "version": "v1", "published": "2021-04-22T20:49:40.000Z", "updated": "2021-04-22T20:49:40.000Z", "title": "SPECTRE: Defending Against Backdoor Attacks Using Robust Statistics", "authors": [ "Jonathan Hayase", "Weihao Kong", "Raghav Somani", "Sewoong Oh" ], "comment": "29 pages 19 figures", "categories": [ "cs.LG", "cs.AI", "stat.ML" ], "abstract": "Modern machine learning increasingly requires training on a large collection of data from multiple sources, not all of which can be trusted. A particularly concerning scenario is when a small fraction of poisoned data changes the behavior of the trained model when triggered by an attacker-specified watermark. Such a compromised model will be deployed unnoticed as the model is accurate otherwise. There have been promising attempts to use the intermediate representations of such a model to separate corrupted examples from clean ones. However, these defenses work only when a certain spectral signature of the poisoned examples is large enough for detection. There is a wide range of attacks that cannot be protected against by the existing defenses. We propose a novel defense algorithm using robust covariance estimation to amplify the spectral signature of corrupted data. This defense provides a clean model, completely removing the backdoor, even in regimes where previous methods have no hope of detecting the poisoned examples. Code and pre-trained models are available at https://github.com/SewoongLab/spectre-defense .", "revisions": [ { "version": "v1", "updated": "2021-04-22T20:49:40.000Z" } ], "analyses": { "keywords": [ "backdoor attacks", "robust statistics", "spectral signature", "novel defense algorithm", "poisoned examples" ], "note": { "typesetting": "TeX", "pages": 29, "language": "en", "license": "arXiv", "status": "editable" } } }