Created
October 4, 2019 14:07
-
-
Save cozek/3b21dee77537561a309f006ab1b1b258 to your computer and use it in GitHub Desktop.
tweet tokenizing script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Tweet tokenizing script based on the Twitter preprocessing script provided with GloVe (Pennington et al.), coupled with NLTK's TweetTokenizer.
Date: 4th October 2019 | |
""" | |
import pickle | |
import sys | |
import re | |
import csv | |
from nltk.tokenize import TweetTokenizer | |
# Regex flags shared by the substitutions in this module:
# DOTALL lets "." match newlines, MULTILINE makes "^"/"$" match at line breaks.
FLAGS = re.DOTALL | re.MULTILINE
def hashtag(text):
    """Rewrite one matched hashtag; meant as a replacement callback for ``re.sub``.

    Args:
        text: Match object whose matched text is a hashtag including the
            leading ``#``.

    Returns:
        If the hashtag body is entirely upper case, the lowercased body padded
        with one space on each side. Otherwise ``"<hashtag>"`` followed by the
        body split in front of every capital letter, all joined by single
        spaces (``"#HelloWorld"`` -> ``"<hashtag> Hello World"``).
    """
    hashtag_body = text.group()[1:]  # drop the leading "#"
    if hashtag_body.isupper():
        return " {} ".format(hashtag_body.lower())
    # Split in front of each capital letter. When the body itself starts with a
    # capital, the split yields an empty first piece; it is dropped so the
    # result never contains a doubled space after "<hashtag>".
    # The pattern has no ".", "^" or "$", so no regex flags are required.
    pieces = [piece for piece in re.split(r"(?=[A-Z])", hashtag_body) if piece]
    return " ".join(["<hashtag>"] + pieces)
def allcaps(text):
    """Lowercase a matched run of text and tag it with ``<allcaps>``.

    Meant as a replacement callback for ``re.sub``.

    Args:
        text: Match object for the text to rewrite.

    Returns:
        The matched text in lower case, followed by a space and ``<allcaps>``.
    """
    matched = text.group()
    return "{} <allcaps>".format(matched.lower())
def tokenize(text):
    """Normalize a raw tweet by replacing special spans with placeholder tags.

    The substitutions run in a fixed order, each on the output of the previous
    one: URLs, user mentions, emoticons, slashes, hearts, numbers, hashtags,
    repeated punctuation, elongated words and runs of capitals.

    Args:
        text: The tweet text to normalize.

    Returns:
        The normalized text, lowercased.
    """
    # Building blocks for emoticons: eyes followed by an optional nose,
    # and the mirrored order for faces written right-to-left.
    eyes_then_nose = r"[8:=;]" + r"['`\-]?"
    nose_then_eyes = r"['`\-]?" + r"[8:=;]"

    # (pattern, replacement) pairs; order matters because later rules see the
    # output of earlier ones.
    rules = (
        (r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>"),
        (r"@\w+", "<user>"),
        (eyes_then_nose + r"[)dD]+|[)dD]+" + nose_then_eyes, "<smile>"),
        (eyes_then_nose + r"p+", "<lolface>"),
        (eyes_then_nose + r"\(+|\)+" + nose_then_eyes, "<sadface>"),
        (eyes_then_nose + r"[\/|l*]", "<neutralface>"),
        (r"/", " / "),
        (r"<3", "<heart>"),
        (r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>"),
        (r"#\S+", hashtag),
        (r"([!?.]){2,}", r"\1 <repeat>"),
        (r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>"),
        # Only runs of two or more ASCII capitals are tagged; this is
        # deliberately narrower than the pattern in the Ruby script this was
        # ported from, which tagged far more text as <allcaps>.
        (r"([A-Z]){2,}", allcaps),
    )

    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text, flags=FLAGS)
    return text.lower()
def preprocess(data):
    """Normalize and tokenize every sentence in ``data``.

    Each sentence is first passed through ``tokenize`` and the result is then
    split by a single shared NLTK ``TweetTokenizer`` instance.

    Args:
        data: Iterable of sentence strings.

    Returns:
        A list with one entry per sentence, each entry being whatever
        ``TweetTokenizer.tokenize`` returns for the normalized sentence.
    """
    tweet_tokenizer = TweetTokenizer()
    tokenized = []
    for sent in data:
        normalized = tokenize(sent)
        tokenized.append(tweet_tokenizer.tokenize(normalized))
    return tokenized
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment