-
-
Save tokestermw/cb87a97113da12acb388 to your computer and use it in GitHub Desktop.
""" | |
preprocess-twitter.py | |
python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)" | |
Script for preprocessing tweets by Romain Paulus | |
with small modifications by Jeffrey Pennington | |
with translation to Python by Motoki Wu | |
Translation of Ruby script to create features for GloVe vectors for Twitter data. | |
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb | |
""" | |
import sys
import re

# Flags passed to every regular expression in this module: MULTILINE makes
# ^ and $ match at line boundaries, DOTALL lets "." also match newlines.
FLAGS = re.MULTILINE | re.DOTALL
def hashtag(text):
    """Rewrite one hashtag regex match as GloVe-style tokens.

    Intended to be used as the replacement callback of ``re.sub`` for a
    pattern that matches a whole hashtag including its leading ``#``.

    Args:
        text: regex match object whose matched string starts with ``#``.

    Returns:
        ``"<hashtag> body <allcaps>"`` with the body lower-cased when the
        body is all upper case; otherwise ``"<hashtag>"`` followed by the
        body split before every capital letter, joined by single spaces.
    """
    hashtag_body = text.group()[1:]  # drop the leading "#"
    if hashtag_body.isupper():
        # Lower-case here so that a later all-caps pass over the same text
        # cannot match the body again and append a second <allcaps> marker.
        return "<hashtag> {} <allcaps>".format(hashtag_body.lower())
    # Split with a capturing, non-empty pattern: zero-width patterns such as
    # (?=[A-Z]) are ignored by re.split before Python 3.7 and produce a
    # leading empty string (hence a doubled space) from 3.7 onwards.
    pieces = [p for p in re.split(r"([A-Z][^A-Z]*)", hashtag_body) if p]
    return " ".join(["<hashtag>"] + pieces)
def allcaps(text):
    """Lower-case a regex match and tag it with ``<allcaps>``.

    Intended to be used as the replacement callback of ``re.sub``.

    Args:
        text: regex match object covering the upper-case run.

    Returns:
        The matched string in lower case followed by ``" <allcaps>"``.
    """
    return text.group().lower() + " <allcaps>"
def tokenize(text):
    """Normalize a raw tweet into the tagged, lower-cased form used by GloVe.

    Applies a fixed sequence of regex substitutions that replace URLs,
    mentions, emoticons, numbers, hashtags, repeated punctuation, elongated
    words and upper-case runs with placeholder tags such as ``<url>`` or
    ``<user>``. The order of the substitutions matters: each one operates on
    the output of the previous one.

    Args:
        text: the raw tweet as a string.

    Returns:
        The rewritten tweet, lower-cased, as a single string.
    """
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # Helper so the code is less repetitive. It reads the *current* value of
    # ``text`` from the enclosing scope each time it is called.
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    # NOTE(review): slashes are spaced out before the emoticon rules run, so
    # ":/" has already become ": / " by the time the neutral-face rule is
    # applied. The order is kept as-is to stay a faithful translation.
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text.lower()
if __name__ == '__main__':
    # Command-line entry point: expects exactly one argument, either the
    # tweet text to preprocess or the literal word "test" to run a built-in
    # sample sentence.
    if len(sys.argv) != 2:
        # Exit with a readable message instead of the ValueError that tuple
        # unpacking of sys.argv raises on a wrong argument count.
        sys.exit('usage: python preprocess-twitter.py "<tweet text>"  (or: test)')
    text = sys.argv[1]
    if text == "test":
        text = "I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
    tokens = tokenize(text)
    # Parenthesized single-argument form runs on both Python 2 and Python 3.
    print(tokens)
What's the license on this? Same as the original?
Public Domain and Dedication license http://opendatacommons.org/licenses/pddl/
Found some bugs:
- It doesn't add the tag to hashtags in caps.
- It doesn't split hashtags as: #WelcomeRefugees = welcome refugees as the original does.
I paste here my version:
`"""
preprocess-twitter.py
python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"
Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu
Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
"""
import sys
import regex as re
FLAGS = re.MULTILINE | re.DOTALL
def hashtag(text):
text = text.group()
hashtag_body = text[1:]
if hashtag_body.isupper():
result = "<hashtag> {} <allcaps>".format(hashtag_body.lower())
else:
result = " ".join(["<hashtag>"] + [re.sub(r"([A-Z])",r" \1", hashtag_body, flags=FLAGS)])
return result
def allcaps(text):
text = text.group()
return text.lower() + " <allcaps>"
def tweet_preprocessing(text):
# Different regex parts for smiley faces
eyes = r"[8:=;]"
nose = r"['`-]?"
# function so code less repetitive
def re_sub(pattern, repl):
return re.sub(pattern, repl, text, flags=FLAGS)
text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
text = re_sub(r"@\w+", "<user>")
text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
text = re_sub(r"/"," / ")
text = re_sub(r"<3","<heart>")
text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
text = re_sub(r"#\S+", hashtag)
text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
# text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
text = re_sub(r"([A-Z]){2,}", allcaps)
return text.lower()
if __name__ == '__main__':
_, text = sys.argv
if text == "test":
text = "I TEST alllll kinds of #hashtags and #HASHTAGS and #HashTags, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
tokens = tweet_preprocessing(text)
print tokens`
Forked version with edits suggested by above comments: https://gist.github.com/ppope/0ff9fa359fb850ecf74d061f3072633a