A curatorial experiment through a residency with the Internet Archive: searches the Internet Archive for a given term, downloads the first result, and extracts its most frequent word as the next search term.
#!/usr/bin/python
'''
MOST FREQUENT WORD SEARCH
Jeff Thompson | 2013 | www.jeffreythompson.org

A curatorial experiment through a residency with the Internet Archive.

REQUIRES:
+ Natural Language Toolkit (NLTK)
+ Internet Archive search module
'''
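# NOTE: both requirements are assumed to be installable from PyPI,
# e.g. via something like:  pip install nltk internetarchive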
import internetarchive as ia
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
import os
search_term = 'test'                    # starting search term
collection = 'gutenberg'                # archive.org collection to search within
download_folder = 'DownloadedFiles'
file_format = '.txt'
min_word_len = 5                        # only count words longer than this
output_filename = search_term + '.csv'
pathway_string = search_term            # running record of the chain of terms
# CREATE CSV IF IT DOESN'T ALREADY EXIST
if not os.path.exists(output_filename):
    with open(output_filename, 'a') as csv:
        # header row, then the seed term starts the pathway
        csv.write('search_term,id,downloaded_file' + '\n' + search_term)

# RUN PROCESS UNTIL SOMETHING BREAKS :)
while True:

    # SEARCH
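    # archive.org advanced-search syntax: free-text term ANDed with a
    # collection filter, e.g. 'test AND (collection:gutenberg)'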
    search_query = search_term.lower() + ' AND (collection:' + collection + ')'
    return_data = [ 'identifier' ]
    print '\nsearching for "' + search_term + '"...'
    search = ia.Search(search_query, return_data)
    if search.num_found > 0:
        result = search.results.next()
        id = result['identifier']
        print '\nfound:'
        print ' id: ' + id
        print ' url: ' + 'http://archive.org/details/' + id
    else:
        print ' no search results, sorry!'
        break

    # DOWNLOAD
    print '\ndownloading first search result...'
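    # wget flags: -r/-l1 recurse one level, -H span hosts, -nc don't re-download,
    # -np/-nH/--cut-dirs=2 flatten the saved paths, -q quiet, -e robots=off
    # ignore robots.txt, -A keep only matching files, -P save into download_folder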
    download_string = 'wget -r -H -nc -np -nH -q --cut-dirs=2 -e robots=off -l1 -A ' + file_format + ' -P ' + download_folder + ' http://archive.org/download/' + id
    os.system(download_string)
    downloaded_files = os.listdir(download_folder + '/' + id)
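    # use the first plain-text file, skipping archive.org metadata files
    # (e.g. <id>_meta.txt)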
    for file in downloaded_files:
        if 'meta' not in file and file.endswith('.txt'):
            print ' ' + file
            downloaded_filename = download_folder + '/' + id + '/' + file
            break

    # EXTRACT WORDS AND COUNT FREQUENCY
    print '\ncounting word frequencies in "' + file + '"...'
    text = ''
    with open(downloaded_filename) as file:
        for line in file:
            text += line
    tokenizer = RegexpTokenizer(r'\w+')    # split on runs of word characters
    words = []
    for word in tokenizer.tokenize(text):
        if len(word) > min_word_len:
            words.append(word.lower())
    freq_dist = FreqDist(words)
    # GET 10 MOST FREQUENT WORDS OVER A CERTAIN LENGTH
    most_freq = []
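    # NOTE: this relies on older NLTK (2.x) behavior, where FreqDist.keys()
    # returned samples sorted by descending frequency; on NLTK 3+ you would
    # iterate freq_dist.most_common() instead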
    for word in freq_dist.keys():
        if word == 'project' or word == 'gutenberg':    # skip, just in case
            continue
        most_freq.append(word)
        print ' ' + str(freq_dist[word]) + ': ' + word
        if len(most_freq) >= 10:    # stop once we have ten words
            break
    # NEXT SEARCH TERM
    for term in most_freq:
        if term != search_term:
            search_term = term
            break
    print '\nnext search term: "' + search_term + '"'
    print ''
    pathway_string += ' > ' + search_term
    # SAVE RESULTS TO FILE
    with open(output_filename, 'a') as csv:    # append to existing file
        csv.write('\n' + search_term + ',' + id + ',' + downloaded_filename)

    # PRINT A DIVIDER AND CONTINUE
    print '- ' * 20
# DONE (or broken)
print '\n' + ('- ' * 20) + '\n'
print 'resulting pathway:\n' + pathway_string
print '\nDONE!' + ('\n' * 3)
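A minimal usage sketch, assuming a Python 2.x environment with NLTK and the internetarchive module installed, plus wget on the PATH (the filename is hypothetical):

python most_frequent_word.py

Each hop appends a row to test.csv, and the chain of terms prints as a pathway once a search finally comes up empty.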