{
  "id": "1605.03481",
  "version": "v1",
  "published": "2016-05-11T15:30:09.000Z",
  "updated": "2016-05-11T15:30:09.000Z",
  "title": "Tweet2Vec: Character-Based Distributed Representations for Social Media",
  "authors": [
    "Bhuwan Dhingra",
    "Zhong Zhou",
    "Dylan Fitzpatrick",
    "Michael Muehl",
    "William W. Cohen"
  ],
  "comment": "6 pages, 2 figures, 4 tables, accepted as conference paper at ACL 2016",
  "categories": [
    "cs.LG",
    "cs.CL"
  ],
  "abstract": "Text from social media provides a set of challenges that can cause traditional NLP approaches to fail. Informal language, spelling errors, abbreviations, and special characters are all commonplace in these posts, leading to a prohibitively large vocabulary size for word-level approaches. We propose a character composition model, tweet2vec, which finds vector-space representations of whole tweets by learning complex, non-local dependencies in character sequences. The proposed model outperforms a word-level baseline at predicting user-annotated hashtags associated with the posts, doing significantly better when the input contains many out-of-vocabulary words or unusual character sequences. Our tweet2vec encoder is publicly available.",
  "revisions": [
    {
      "version": "v1",
      "updated": "2016-05-11T15:30:09.000Z"
    }
  ],
  "analyses": {
    "keywords": [
      "social media",
      "character-based distributed representations",
      "finds vector-space representations",
      "character composition model",
      "traditional nlp approaches"
    ],
    "tags": [
      "conference paper"
    ],
    "note": {
      "typesetting": "TeX",
      "pages": 6,
      "language": "en",
      "license": "arXiv",
      "status": "editable"
    }
  }
}