{ "id": "1802.02163", "version": "v1", "published": "2018-02-06T19:00:12.000Z", "updated": "2018-02-06T19:00:12.000Z", "title": "How to Make Causal Inferences Using Texts", "authors": [ "Naoki Egami", "Christian J. Fong", "Justin Grimmer", "Margaret E. Roberts", "Brandon M. Stewart" ], "comment": "47 pages", "categories": [ "stat.ML", "cs.CL", "stat.ME" ], "abstract": "New text as data techniques offer a great promise: the ability to inductively discover measures that are useful for testing social science theories of interest from large collections of text. We introduce a conceptual framework for making causal inferences with discovered measures as a treatment or outcome. Our framework enables researchers to discover high-dimensional textual interventions and estimate the ways that observed treatments affect text-based outcomes. We argue that nearly all text-based causal inferences depend upon a latent representation of the text and we provide a framework to learn the latent representation. But estimating this latent representation, we show, creates new risks: we may introduce an identification problem or overfit. To address these risks we describe a split-sample framework and apply it to estimate causal effects from an experiment on immigration attitudes and a study on bureaucratic response. Our work provides a rigorous foundation for text-based causal inferences.", "revisions": [ { "version": "v1", "updated": "2018-02-06T19:00:12.000Z" } ], "analyses": { "keywords": [ "latent representation", "text-based causal inferences", "estimate causal effects", "testing social science theories", "treatments affect text-based outcomes" ], "note": { "typesetting": "TeX", "pages": 47, "language": "en", "license": "arXiv", "status": "editable" } } }