Created
February 22, 2012 19:01
-
-
Save pbdeuchler/1886665 to your computer and use it in GitHub Desktop.
CSCI 345 Project 1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
import os
import shelve
import sys
#Global vars
# Path of the shelve database file that main() persists the index to.
SHELF = os.path.join(os.getcwd(), 'store')
# Corpus directory under the current working directory. The empty last
# component keeps the trailing path separator, so code that builds a file
# path as ``LOC + name`` keeps working.
LOC = os.path.join(os.getcwd(), 'files', '')
# Raw stdout writer: emits text exactly as given, without the newline that
# print would append.
write = sys.stdout.write
# In-memory index shared by the whole module:
# {term: [total frequency, [doc ids], number of docs]}.
db = {}
#----------------------------------------------------------------------------------------------- | |
#Main function | |
def main():
    """Index every file in LOC, print the sample queries, and persist the index.

    Each directory entry is handed to FileRead, which merges that file's
    terms into the module-level ``db``. Afterwards the postings for five
    fixed terms are written to stdout and ``db`` is saved in the shelve
    database under the key 'index'.
    """
    print("Indexing...")
    for name in os.listdir(LOC):
        # Compare by value ('==', not 'is') and skip just this entry:
        # returning here would abandon the rest of the corpus and never
        # print the queries or save the index.
        if name == '.DS_Store':
            continue
        # FileRead does all of its work in the constructor.
        FileRead(os.path.join(LOC, name))

    #Show queries for required terms
    for term in ('file', 'performance', 'read', 'window', 'subject'):
        write(getpostings(term))

    # closing() guarantees the shelf is closed even if the write raises;
    # unlike ``with shelve.open(...)`` it also works on Python 2.
    with contextlib.closing(shelve.open(SHELF)) as sh:
        sh['index'] = db
#----------------------------------------------------------------------------------------------- | |
#Helper functions | |
def getpostings(query, index=None):
    """Return a printable summary of the postings for ``query``.

    Args:
        query: the term to look up (exact, case-sensitive match).
        index: mapping of term -> [frequency, [doc ids], doc count].
            Defaults to the module-level ``db``.

    Returns:
        A multi-line string giving the total frequency, the document count
        and the first 50 document ids in ascending order, or the string
        'Key not found' when the term is not in the index.
    """
    if index is None:
        index = db
    try:
        freq, doc_ids, docs = index[query]
    except KeyError:
        return 'Key not found'

    # Sort a copy: the index must not be altered by a read-only query,
    # otherwise queried terms end up stored with int ids and the rest
    # with string ids.
    try:
        ordered = sorted(int(doc_id) for doc_id in doc_ids)
    except ValueError:
        # Some id is not numeric; fall back to plain string ordering
        # instead of crashing.
        ordered = sorted(str(doc_id) for doc_id in doc_ids)
    fifty = ', '.join(str(doc_id) for doc_id in ordered[:50])

    return ('\n'
            '%s was found %i times in %i documents.\n'
            'The first 50 were:\n'
            '%s\n') % (query.capitalize(), freq, docs, fifty)
#----------------------------------------------------------------------------------------------- | |
#Class definitions | |
class FileRead(object):
    """Read one corpus file, count its terms and merge them into ``db``.

    All work happens in the constructor. Failures in any stage are
    reported on stdout and do not propagate, so one bad file does not
    stop a whole indexing run.

    Attributes:
        location: the path given to the constructor; used to open the file.
        filename: base name of the file up to its first '.'; used as the
            document id in the index.
        filetype: the part of the base name between the first and second
            '.', or '' when the name has no '.'.
        terms: list of the valid terms in the file, in reading order.
        index: {term: frequency within this file}.
    """

    def __init__(self, input):
        # NOTE: the parameter name shadows the builtin ``input``; it is
        # kept so existing callers are unaffected.
        self.location = input #for the file open
        # Parse only the base name: splitting the whole path on '.' picks
        # up dots in directory names, and indexing [1] raised IndexError
        # for files without an extension.
        pieces = os.path.basename(input).split('.')
        self.filename = pieces[0] #for tracking and indexing
        self.filetype = pieces[1] if len(pieces) > 1 else '' #just in case
        # Set up front so the object is always fully formed, even when
        # reading fails.
        self.terms = []
        self.index = {}

        try:
            self.terms = list(self.yield_valid_terms())
        except (IOError, OSError, UnicodeError):
            print(input.upper() + " READ FAILED")
            # Nothing was read, so there is nothing to index or store.
            return

        label = self.filename.upper() + '.' + self.filetype.upper()
        try:
            self.make_index() #indexes terms into {'term': freq} format
        except Exception:
            # Best effort: report and carry on with whatever was counted.
            print(label + " INDEX FAILED")
        try:
            #merges index into main database, format is
            #{'term': [freq, [list of doc id's], # of docs found]}
            self.store()
        except Exception:
            print(label + " STORE FAILED")

    def yield_valid_terms(self):
        """Yield each whitespace-delimited token with at least one letter."""
        with open(self.location) as f:
            for line in f:
                for term in line.split():
                    # Tokens made only of digits/punctuation are not terms.
                    if any(c.isalpha() for c in term):
                        yield term

    def make_index(self):
        """Count occurrences of each term in ``self.terms`` into ``self.index``."""
        for term in self.terms:
            self.index[term] = self.index.get(term, 0) + 1

    def store(self):
        """Merge this file's term counts into the module-level ``db``."""
        for term, freq in self.index.items():
            try:
                entry = db[term]
            except KeyError:
                # First time this term is seen in any document.
                db[term] = [freq, [self.filename], 1]
                continue
            entry[0] += freq #add to the total frequency
            entry[1].append(self.filename) #add file id to the doc id list
            entry[2] = len(entry[1])
            # Redundant for a plain dict (entry is mutated in place), but
            # kept so the write-back is explicit.
            db[term] = entry
#----------------------------------------------------------------------------------------------- | |
#Called on execution | |
if __name__ == '__main__':
    # Run the indexer only when this file is executed as a script, not
    # when it is imported.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment