Skip to content

Instantly share code, notes, and snippets.

@randyzwitch
Created July 31, 2013 14:38
Show Gist options
  • Save randyzwitch/6122550 to your computer and use it in GitHub Desktop.
Save randyzwitch/6122550 to your computer and use it in GitHub Desktop.
Python word search
import collections
import nltk
#Count how many URLs contain each "big" English dictionary word as a substring.
#Inputs: the Unix word list (one word per line), the NLTK English stopword
#list, and a file holding one URL per line.
#Output: `wordcount`, a Counter mapping word -> number of URLs containing it.

#Stopwords corpus from NLTK
stopwords = nltk.corpus.stopwords.words('english')
#Set copy so each membership test below is O(1) rather than a list scan
_stopword_set = set(stopwords)

#Build english_dictionary of prospect words
english_dictionary = []
#Dictionary from Unix; `with` closes the file handle once it has been read
with open("/usr/share/dict/words") as internal_dict:
    for line in internal_dict:
        #Strip the newline BEFORE filtering: the raw line ends in "\n", so it
        #could never equal a stopword and its length was overstated by one.
        word = line.rstrip('\n')
        #make sure only "big", useful words included
        if word not in _stopword_set and len(word) > 4:
            english_dictionary.append(word)

#How many words are in the complete dictionary? (echoes only in a REPL)
len(english_dictionary)

#Import urls (lines keep their trailing newline; harmless for substring tests)
with open("/path/to/urls/file.csv") as _url_file:
    urls = list(_url_file)

#Build counter dictionary
wordcount = collections.Counter()
for word in english_dictionary: #Loop over all possible English words
    for url in urls: #Loop over all urls in list
        #Plain substring match, so a word is also counted when it appears
        #inside a longer token of the URL.
        if word in url:
            wordcount[word] += 1 #Once word found, add to dictionary counter
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment