[Working On] Tokenizing Twitter Statuses to Create Wordle Visual
from __future__ import division
import re
from nltk.tokenize import RegexpTokenizer

# \w+ keeps only runs of word characters, so punctuation is dropped during tokenizing
tokenizer = RegexpTokenizer(r'\w+')

f = open('Statuses.txt').read()   # read the saved statuses
# number = re.search(r'\d+', f).group()
# print number
statuses = f.split(' | ')         # statuses are separated by " | " in the file

# Small connector words, URL fragments ("http", "co"), and other filler
# that would otherwise dominate the Wordle
exclude_words = set([
    "http", "https", "htt", "rt", "co", "amp", "in", "of", "is", "you", "me", "my",
    "mine", "to", "the", "i", "them", "so", "t", "by", "it", "continue", "will",
    "probably", "was", "one", "two", "aboard", "about", "above", "across", "after",
    "against", "along", "amid", "among", "anti", "around", "as", "at", "before",
    "behind", "below", "beneath", "beside", "besides", "between", "btw", "beyond",
    "but", "concerning", "considering", "despite", "down", "during", "except",
    "excepting", "excluding", "following", "for", "from", "inside", "into", "like",
    "minus", "near", "off", "on", "onto", "opposite", "outside", "over", "past",
    "per", "plus", "regarding", "round", "since", "than", "through", "toward",
    "towards", "under", "underneath", "unlike", "until", "up", "upon", "versus",
    "via", "with", "within", "without", "out", "when", "not", "have", "has", "this",
    "that", "a", "many", "we", "st", "if", "ok", "okay", "all", "and", "just", "did",
    "what", "your", "or", "either", "k", "ain", "here", "are", "there", "their",
    "need", "basically", "way", "why", "who", "thru", "can", "be", "get", "today",
    "guy", "day", "help", "time", "tomorrow", "tonight", "now", "try", "please"])

for status in statuses:                  # tokenize each status individually
    tokens = tokenizer.tokenize(status)  # split the status into word tokens
    words = [w.lower() for w in tokens]  # make everything lower case
    sentence = sorted(set(words))        # unique words in alphabetical order
    # Drop the excluded connector words and anything one or two letters long;
    # building a new list avoids removing items from the list while looping over it
    sentence = [s for s in sentence if s not in exclude_words and len(s) > 2]
    for wordle_word in sentence:
        # Skip tokens containing digits (fragments of split-up URLs and screen names)
        if any(char.isdigit() for char in wordle_word):
            continue
        print wordle_word                # final word list to copy and paste into Wordle
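A possible alternative sketch, not part of the original gist: NLTK ships an English stopword corpus, and collections.Counter can tally word frequencies, which saves maintaining the hand-built exclude list by hand. The 'Statuses.txt' filename and ' | ' separator are carried over from the script above; the extra Twitter-specific stopwords are assumptions, and the stopword corpus needs a one-time nltk.download('stopwords').

from collections import Counter

from nltk.corpus import stopwords            # requires a one-time nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
# NLTK's English stopwords plus a few Twitter-specific fragments (assumed list)
stop = set(stopwords.words('english')) | set(['http', 'https', 'co', 'rt', 'amp'])

counts = Counter()
for status in open('Statuses.txt').read().split(' | '):
    for token in tokenizer.tokenize(status.lower()):
        # keep words that are 3+ letters, not stopwords, and contain no digits
        if len(token) > 2 and token not in stop and not any(c.isdigit() for c in token):
            counts[token] += 1

# print each word repeated by its count so Wordle sizes it by frequency
for word, count in counts.most_common():
    print ' '.join([word] * count)

Repeating each word by its count lets Wordle's plain paste box scale words by frequency without relying on its advanced input format.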