{
  "id": "1802.02163",
  "version": "v1",
  "published": "2018-02-06T19:00:12.000Z",
  "updated": "2018-02-06T19:00:12.000Z",
  "title": "How to Make Causal Inferences Using Texts",
  "authors": [
    "Naoki Egami",
    "Christian J. Fong",
    "Justin Grimmer",
    "Margaret E. Roberts",
    "Brandon M. Stewart"
  ],
  "comment": "47 pages",
  "categories": [
    "stat.ML",
    "cs.CL",
    "stat.ME"
  ],
  "abstract": "New text as data techniques offer a great promise: the ability to inductively discover measures that are useful for testing social science theories of interest from large collections of text. We introduce a conceptual framework for making causal inferences with discovered measures as a treatment or outcome. Our framework enables researchers to discover high-dimensional textual interventions and estimate the ways that observed treatments affect text-based outcomes. We argue that nearly all text-based causal inferences depend upon a latent representation of the text and we provide a framework to learn the latent representation. But estimating this latent representation, we show, creates new risks: we may introduce an identification problem or overfit. To address these risks we describe a split-sample framework and apply it to estimate causal effects from an experiment on immigration attitudes and a study on bureaucratic response. Our work provides a rigorous foundation for text-based causal inferences.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2018-02-06T19:00:12.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "latent representation",
      "text-based causal inferences",
      "estimate causal effects",
      "testing social science theories",
      "treatments affect text-based outcomes"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 47,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}