A curatorial experiment through a residency with the Internet Archive: searches the Internet Archive for a given term, downloads the first result, and extracts its most frequent word as the next search term.
#!/usr/bin/python
'''
MOST FREQUENT WORD SEARCH
Jeff Thompson | 2013 | www.jeffreythompson.org

A curatorial experiment through a residency with the Internet Archive.

REQUIRES:
+ Natural Language Toolkit (NLTK)
+ Internet Archive search module
'''
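# NOTE: both requirements are assumed to be installable from PyPI,
# e.g. via something like:  pip install nltk internetarchive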
import internetarchive as ia
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
import os
search_term = 'test'                    # starting search term
collection = 'gutenberg'                # archive.org collection to search within
download_folder = 'DownloadedFiles'
file_format = '.txt'
min_word_len = 5                        # only count words longer than this
output_filename = search_term + '.csv'
pathway_string = search_term            # running record of the chain of terms
# CREATE CSV IF IT DOESN'T ALREADY EXIST
if not os.path.exists(output_filename):
    with open(output_filename, 'a') as csv:
        # header row, then the seed term starts the pathway
        csv.write('search_term,id,downloaded_file' + '\n' + search_term)

# RUN PROCESS UNTIL SOMETHING BREAKS :)
while True:

    # SEARCH
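    # archive.org advanced-search syntax: free-text term ANDed with a
    # collection filter, e.g. 'test AND (collection:gutenberg)'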
    search_query = search_term.lower() + ' AND (collection:' + collection + ')'
    return_data = [ 'identifier' ]
    print '\nsearching for "' + search_term + '"...'
    search = ia.Search(search_query, return_data)
    if search.num_found > 0:
        result = search.results.next()
        id = result['identifier']
        print '\nfound:'
        print ' id: ' + id
        print ' url: ' + 'http://archive.org/details/' + id
    else:
        print ' no search results, sorry!'
        break

    # DOWNLOAD
    print '\ndownloading first search result...'
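    # wget flags: -r/-l1 recurse one level, -H span hosts, -nc don't re-download,
    # -np/-nH/--cut-dirs=2 flatten the saved paths, -q quiet, -e robots=off
    # ignore robots.txt, -A keep only matching files, -P save into download_folder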
    download_string = 'wget -r -H -nc -np -nH -q --cut-dirs=2 -e robots=off -l1 -A ' + file_format + ' -P ' + download_folder + ' http://archive.org/download/' + id
    os.system(download_string)
    downloaded_files = os.listdir(download_folder + '/' + id)
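    # use the first plain-text file, skipping archive.org metadata files
    # (e.g. <id>_meta.txt)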
    for file in downloaded_files:
        if 'meta' not in file and file.endswith('.txt'):
            print ' ' + file
            downloaded_filename = download_folder + '/' + id + '/' + file
            break

    # EXTRACT WORDS AND COUNT FREQUENCY
    print '\ncounting word frequencies in "' + file + '"...'
    text = ''
    with open(downloaded_filename) as file:
        for line in file:
            text += line
    tokenizer = RegexpTokenizer(r'\w+')    # split on runs of word characters
    words = []
    for word in tokenizer.tokenize(text):
        if len(word) > min_word_len:
            words.append(word.lower())
    freq_dist = FreqDist(words)
    # GET 10 MOST FREQUENT WORDS OVER A CERTAIN LENGTH
    most_freq = []
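    # NOTE: this relies on older NLTK (2.x) behavior, where FreqDist.keys()
    # returned samples sorted by descending frequency; on NLTK 3+ you would
    # iterate freq_dist.most_common() instead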
    for word in freq_dist.keys():
        if word == 'project' or word == 'gutenberg':    # skip, just in case
            continue
        most_freq.append(word)
        print ' ' + str(freq_dist[word]) + ': ' + word
        if len(most_freq) >= 10:    # stop once we have ten words
            break
    # NEXT SEARCH TERM
    for term in most_freq:
        if term != search_term:
            search_term = term
            break
    print '\nnext search term: "' + search_term + '"'
    print ''
    pathway_string += ' > ' + search_term
    # SAVE RESULTS TO FILE
    with open(output_filename, 'a') as csv:    # append to existing file
        csv.write('\n' + search_term + ',' + id + ',' + downloaded_filename)

    # PRINT A DIVIDER AND CONTINUE
    print '- ' * 20
# DONE (or broken)
print '\n' + ('- ' * 20) + '\n'
print 'resulting pathway:\n' + pathway_string
print '\nDONE!' + ('\n' * 3)
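A minimal usage sketch, assuming a Python 2.x environment with NLTK and the internetarchive module installed, plus wget on the PATH (the filename is hypothetical):

python most_frequent_word.py

Each hop appends a row to test.csv, and the chain of terms prints as a pathway once a search finally comes up empty.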