-
-
Save ppope/0ff9fa359fb850ecf74d061f3072633a to your computer and use it in GitHub Desktop.
FORK: Python version of Ruby script to preprocess tweets for use in GloVe featurization http://nlp.stanford.edu/projects/glove/.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
preprocess-twitter.py
python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"
Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu
Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
""" | |
import sys | |
import regex as re | |
# Regex flags passed to the pattern calls below: MULTILINE makes "^"/"$"
# match at line boundaries, DOTALL lets "." match newline characters too.
FLAGS = re.DOTALL | re.MULTILINE
def hashtag(text):
    """Rewrite one ``#hashtag`` match into GloVe-style tokens.

    Intended as the replacement callback for ``re.sub``.

    Args:
        text: Match object whose ``group()`` is the hashtag, including the
            leading ``#``.

    Returns:
        For a body that ``str.isupper()`` accepts (e.g. ``#HASHTAGS``), the
        lowercased body padded with one space on each side.  Otherwise
        ``"<hashtag>"`` followed by the body split in front of every ASCII
        uppercase letter, all joined by single spaces.
    """
    hashtag_body = text.group()[1:]  # drop the leading "#"
    if hashtag_body.isupper():
        return " {} ".format(hashtag_body.lower())
    # The zero-width split yields an empty first piece when the body starts
    # with an uppercase letter ("HashTag" -> ["", "Hash", "Tag"]); dropping
    # it avoids a doubled space after "<hashtag>".  No regex flags are passed
    # because the pattern contains no ".", "^" or "$" for them to affect.
    pieces = [piece for piece in re.split(r"(?=[A-Z])", hashtag_body) if piece]
    return " ".join(["<hashtag>"] + pieces)
def allcaps(text):
    """Lowercase a matched run of text and append the ``<allcaps>`` marker.

    Intended as the replacement callback for ``re.sub``.

    Args:
        text: Match object; its full match (``group()``) is the text to
            rewrite.

    Returns:
        The matched text in lowercase, followed by a space and
        ``<allcaps>``.
    """
    lowered = text.group().lower()
    return "{} <allcaps>".format(lowered)
def tokenize(text):
    """Normalize a tweet by replacing special spans with placeholder tokens.

    The substitutions are applied one after another, in the order listed in
    ``rules``; each rule sees the output of the previous one.

    Args:
        text: The raw tweet text.

    Returns:
        The rewritten text, lowercased.
    """
    # Building blocks shared by the smiley-face patterns.
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # (pattern, replacement) pairs; a replacement is either a template string
    # or a callback that receives the match object.
    rules = (
        (r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>"),
        (r"@\w+", "<user>"),
        (r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>"),
        (r"{}{}p+".format(eyes, nose), "<lolface>"),
        (r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>"),
        (r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>"),
        (r"/", " / "),
        (r"<3", "<heart>"),
        (r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>"),
        (r"#\S+", hashtag),
        (r"([!?.]){2,}", r"\1 <repeat>"),
        (r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>"),
        # The Ruby script's pattern, ([^a-z0-9()<>'`\-]){2,}, tagged nearly
        # everything as <allcaps>; the selection is deliberately limited to
        # runs of two or more uppercase ASCII letters.
        (r"([A-Z]){2,}", allcaps),
    )

    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text, flags=FLAGS)

    return text.lower()
if __name__ == '__main__':
    # Expect exactly one argument: the text to tokenize (quote it in the
    # shell), or the literal word "test" to run the built-in sample tweet.
    # Exit with a usage message instead of an unpacking traceback otherwise.
    if len(sys.argv) != 2:
        sys.exit('usage: python preprocess-twitter.py "<tweet text>" | test')
    text = sys.argv[1]
    if text == "test":
        text = "I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
    tokens = tokenize(text)
    print(tokens)
Edited original to accommodate comments in original thread.
thank you so much! this works beautifully on python3 👍
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
FORK of https://gist.github.com/tokestermw/cb87a97113da12acb388