{ "id": "1605.03481", "version": "v1", "published": "2016-05-11T15:30:09.000Z", "updated": "2016-05-11T15:30:09.000Z", "title": "Tweet2Vec: Character-Based Distributed Representations for Social Media", "authors": [ "Bhuwan Dhingra", "Zhong Zhou", "Dylan Fitzpatrick", "Michael Muehl", "William W. Cohen" ], "comment": "6 pages, 2 figures, 4 tables, accepted as conference paper at ACL 2016", "categories": [ "cs.LG", "cs.CL" ], "abstract": "Text from social media provides a set of challenges that can cause traditional NLP approaches to fail. Informal language, spelling errors, abbreviations, and special characters are all commonplace in these posts, leading to a prohibitively large vocabulary size for word-level approaches. We propose a character composition model, tweet2vec, which finds vector-space representations of whole tweets by learning complex, non-local dependencies in character sequences. The proposed model outperforms a word-level baseline at predicting user-annotated hashtags associated with the posts, doing significantly better when the input contains many out-of-vocabulary words or unusual character sequences. Our tweet2vec encoder is publicly available.", "revisions": [ { "version": "v1", "updated": "2016-05-11T15:30:09.000Z" } ], "analyses": { "keywords": [ "social media", "character-based distributed representations", "finds vector-space representations", "character composition model", "traditional nlp approaches" ], "tags": [ "conference paper" ], "note": { "typesetting": "TeX", "pages": 6, "language": "en", "license": "arXiv", "status": "editable" } } }