{
  "id": "2102.09507",
  "version": "v1",
  "published": "2021-02-18T17:48:49.000Z",
  "updated": "2021-02-18T17:48:49.000Z",
  "title": "Regular Expressions for Fast-response COVID-19 Text Classification",
  "authors": [
    "Igor L. Markov",
    "Jacqueline Liu",
    "Adam Vagner"
  ],
  "comment": "10 pages, 7 tables",
  "categories": [
    "cs.CL",
    "cs.LG",
    "cs.SI"
  ],
  "abstract": "Text classifiers are at the core of many NLP applications and use a variety of algorithmic approaches and software. This paper describes how Facebook determines if a given piece of text - anything from a hashtag to a post - belongs to a narrow topic such as COVID-19. To fully define a topic and evaluate classifier performance we employ human-guided iterations of keyword discovery, but do not require labeled data. For COVID-19, we build two sets of regular expressions: (1) for 66 languages, with 99% precision and recall >50%, (2) for the 11 most common languages, with precision >90% and recall >90%. Regular expressions enable low-latency queries from multiple platforms (PHP, Python, Java and SQL code). Response to challenges like COVID-19 is fast and so are revisions. Comparisons to a DNN classifier show explainable results, higher precision and recall, and less overfitting. Our learnings can be applied to other narrow-topic classifiers.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2021-02-18T17:48:49.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "text classification",
      "fast-response",
      "regular expressions enable low-latency queries",
      "evaluate classifier performance",
      "higher precision"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 10,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}