{ "id": "2102.09507", "version": "v1", "published": "2021-02-18T17:48:49.000Z", "updated": "2021-02-18T17:48:49.000Z", "title": "Regular Expressions for Fast-response COVID-19 Text Classification", "authors": [ "Igor L. Markov", "Jacqueline Liu", "Adam Vagner" ], "comment": "10 pages, 7 tables", "categories": [ "cs.CL", "cs.LG", "cs.SI" ], "abstract": "Text classifiers are at the core of many NLP applications and use a variety of algorithmic approaches and software. This paper describes how Facebook determines if a given piece of text - anything from a hashtag to a post - belongs to a narrow topic such as COVID-19. To fully define a topic and evaluate classifier performance we employ human-guided iterations of keyword discovery, but do not require labeled data. For COVID-19, we build two sets of regular expressions: (1) for 66 languages, with 99% precision and recall >50%, (2) for the 11 most common languages, with precision >90% and recall >90%. Regular expressions enable low-latency queries from multiple platforms (PHP, Python, Java and SQL code). Response to challenges like COVID-19 is fast and so are revisions. Comparisons to a DNN classifier show explainable results, higher precision and recall, and less overfitting. Our learnings can be applied to other narrow-topic classifiers.", "revisions": [ { "version": "v1", "updated": "2021-02-18T17:48:49.000Z" } ], "analyses": { "keywords": [ "text classification", "fast-response", "regular expressions enable low-latency queries", "evaluate classifier performance", "higher precision" ], "note": { "typesetting": "TeX", "pages": 10, "language": "en", "license": "arXiv", "status": "editable" } } }