[Working On] Tokenizing Twitter Statuses to Create Wordle Visual
import re
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')  # keep only word characters, stripping out the punctuation
f = open('Statuses.txt').read()  # read the whole file
# number = re.search(r'\d+', f).group()
# print number
statuses = f.split(' | ')
exclude_words=["http", "rt", "co", "in", "of", "is", "you", "me", "my", "mine", "to", "the", "i", "them", "so", "t", "by", "?", "it",
"so", "continue", "will", "probably", "was", "one", "two", "aboard", "about", "above", "across", "after", "against",
"along", "amid", "among", "anti", "around" "as", "at", "before", "behind", "below", "beneath", "beside", "besides",
"between", "btw", "beyond", "but", "by", "concerning", "considering", "despite", "down", "during", "except", "excepting",
"excluding", "following", "for", "from", "in", "inside", "into", "like", "minus", "near", "of", "off", "on", "onto",
"opposite", "outside", "over", "past", "per", "plus", "regarding", "round", "since", "than", "through", "to", "toward",
"towards", "under", "underneath", "unlike", "until", "up", "upon", "versus", "via", "with", "within", "without", "?",
"!", "?", "out", "it", "as", "when", "will", "not", "probably", "was", "have", "has", "this", "that", "a", "for",
"htt", "https", "many", "we", "st", "if", "ok", "okay", "all", "and", "just", "did", "amp", "what", "your", "or", "either",
"k", "ain", "ain't", "here", "are", "there", "their", "htt?", "need", "basically", "way", "why", "who", "thru",
"can", "be", "get", "today", "guy", "day", "help", "time", "tomorrow", "tonight", "now", "try", "please"]
for status in statuses:  # process each status individually
    tokens = tokenizer.tokenize(status)  # create "tokens" out of the status
    words = [w.lower() for w in tokens]  # make everything lower case
    sentence = sorted(set(words))  # unique words in alphabetical order
    # Drop the small, unimportant connector words and the one- and two-letter words
    # (the \w+ tokenizer already strips punctuation and backslashes, so no extra filter is needed)
    sentence = [w for w in sentence if w not in exclude_words and len(w) > 2]
    for wordle_word in sentence:
        if any(char.isdigit() for char in wordle_word):
            continue  # skip the numbers left over from split-up urls and screen names
        print(wordle_word)  # this gives you the final list to copy and paste into Wordle
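If a file is handier to paste into Wordle than console output, the same pipeline can write its surviving words out directly. This is a minimal sketch, not part of the original script: it assumes it runs after the exclude_words list above, reuses the same 'Statuses.txt' format (statuses separated by ' | '), and the output filename 'wordle_words.txt' is just an illustration.

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
statuses = open('Statuses.txt').read().split(' | ')

with open('wordle_words.txt', 'w') as out:
    for status in statuses:
        words = sorted(set(w.lower() for w in tokenizer.tokenize(status)))
        keep = [w for w in words
                if w not in exclude_words               # the same stop list defined above
                and len(w) > 2                          # drop one- and two-letter words
                and not any(ch.isdigit() for ch in w)]  # drop split-up urls and screen names
        for w in keep:
            out.write(w + '\n')

The resulting wordle_words.txt can then be pasted into Wordle in one go instead of copying from the terminal.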