Skip to content

Instantly share code, notes, and snippets.

@ernestum
Created February 22, 2016 08:09
Show Gist options
  • Save ernestum/7067f67e80eeba29a587 to your computer and use it in GitHub Desktop.
Save ernestum/7067f67e80eeba29a587 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import math
import re
import time
def extractTerms(document):
    '''
    Extracts all the terms (lowercased) from a document string.

    Splits on runs of non-word characters.  Note: a leading or trailing
    non-word character yields an empty-string term (re.split behavior),
    which downstream code counts toward document length.
    '''
    # Raw string: "\W" in a plain literal is an invalid escape sequence
    # (SyntaxWarning in modern Python); r"\W+" is the correct spelling.
    return [t.lower() for t in re.split(r"\W+", document)]
def loadSlides(filename="allslidescropped.txt"):
    '''
    Reads a pdftotext dump and splits it into one document per slide.

    A form-feed character at the start of a line marks the beginning of a
    new slide; the slide title is separated from the body by a tab.
    NOTE: if the file starts with a form feed, an empty document is
    emitted first (preserved from the original behavior so docids match).

    @param filename: path to the concatenated slide text
    @return: list of document strings
    '''
    documents = []
    with open(filename, 'r') as fh:
        currentDoc = ""
        for line in fh:
            # u'\uf06e' is a private-use bullet glyph from the PDF fonts.
            cleanline = line.strip().replace(u'\uf06e', " ")
            if line.startswith(u'\u000c'):
                # Form feed: flush the previous slide, start a new one.
                documents.append(currentDoc)
                currentDoc = cleanline + "\t"
            else:
                currentDoc = currentDoc + " " + cleanline
        # BUG FIX: the original never flushed the text after the last
        # form feed, silently dropping the final slide.
        if currentDoc:
            documents.append(currentDoc)
    return documents
def constructInvertedListsAndDocsizes(documents):
    '''
    Creates a dictionary of inverted lists for each term in the documents.
    Also creates a dictionary mapping docids to document lengths.

    @param documents: iterable of document strings
    @return: (invLists, documentLengths) where invLists maps term ->
             sorted list of docids (with repeats per occurrence) and
             documentLengths maps docid -> number of terms
    '''
    invLists = {}
    documentLengths = {}
    for docid, doc in enumerate(documents):
        terms = extractTerms(doc)
        documentLengths[docid] = len(terms)
        for term in terms:
            invLists.setdefault(term, []).append(docid)
    # Keep each postings list sorted by docid.
    for postings in invLists.values():
        postings.sort()
    return invLists, documentLengths
def mergeInvertedLitsts(list1, list2):
    '''
    Merges two sorted inverted lists into one sorted list.
    Duplicates are kept (every occurrence from both inputs appears in
    the result), so the output length is len(list1) + len(list2).
    '''
    result = []
    i = j = 0
    n1, n2 = len(list1), len(list2)
    while i < n1 and j < n2:
        if list1[i] < list2[j]:
            result.append(list1[i])
            i += 1
        else:
            result.append(list2[j])
            j += 1
    # One side is exhausted; the other's tail is already sorted.
    result.extend(list1[i:])
    result.extend(list2[j:])
    return result
def computeTermFrequencies(invertedList):
    '''
    Given an inverted list (a list of docids where a term or multiple terms
    appear in) that might contain some docids multiple times, computes a list
    of tuples containing the docid and the number of times it occurred in the
    inverted list, sorted by ascending count.
    '''
    counts = {}
    for docid in invertedList:
        counts[docid] = counts.get(docid, 0) + 1
    return sorted(counts.items(), key=lambda kv: kv[1])
def computeBM25(term, invertedLists, documentLenghts, k=1.75, b=0.75):
    '''
    Computes the BM25 score for a term on all the documents it occurs in.
    @param term: The term we are interested in
    @param invertedLists: a dictionary mapping terms to lists of docids they appear in
    @param documentLenghts: a dictionary mapping docids to document lengths
    @param k: A parameter for the BM25 score
    @param b: A parameter for the BM25 score
    @return: A dictionary mapping docids to the BM25 score for the given term.
    '''
    tfs = dict(computeTermFrequencies(invertedLists[term]))
    # BUG FIX: force float division.  Under Python 2 (this script's
    # dialect) the original sum/len truncated avdl to an integer,
    # distorting the document-length normalization below.
    avdl = sum(documentLenghts.values()) / float(len(documentLenghts))
    tf_stars = {docid: tfs[docid] * (k + 1) /
                       (k * (1 - b + b * documentLenghts[docid] / avdl) + tfs[docid])
                for docid in tfs}
    # Inverse document frequency: log2(N / df).
    idf = math.log(len(documentLenghts) / float(len(tfs)), 2)
    return {docid: tf_stars[docid] * idf for docid in tfs}
def mergeBM25Scores(bm25scores):
    '''
    Merges dictionaries of BM25 scores that were computed for different terms.
    Returns a new dictionary where the scores are just added up for each document.
    '''
    merged = {}
    for scores in bm25scores:
        for docid, value in scores.items():
            merged[docid] = merged.get(docid, 0) + value
    return merged
def processQuery(query_terms, invertedLists, documentLengths):
    '''
    Ranks documents for the given query terms by their summed BM25 score.
    Terms not present in the index are ignored; duplicate query terms
    count once.  Returns (docid, score) pairs, best score first.
    '''
    unique_terms = set(query_terms)
    per_term_scores = [computeBM25(term, invertedLists, documentLengths)
                       for term in unique_terms if term in invertedLists]
    ranking = sorted(mergeBM25Scores(per_term_scores).items(),
                     key=lambda kv: kv[1])
    ranking.reverse()
    return ranking
def docdescription(docid, slides, slidesPerPage=16):
    '''
    Describes where a slide sits when printed slidesPerPage to a page:
    the print-page number followed by an HTML arrow entity pointing to
    the slide's quadrant on that page.

    @param docid: zero-based slide index
    @param slides: list of slide texts (currently unused, kept for
                   interface symmetry with docdescriptionLATEX)
    @param slidesPerPage: slides per printed page (assumed a square grid)
    @return: e.g. "3&nearr;" for a slide in the top-right of page 3
    '''
    # BUG FIX: use floor division so the page/row stay integers under
    # Python 3 as well (plain / was floor division only in Python 2).
    printpage = docid // slidesPerPage + 1
    idonpage = docid % slidesPerPage
    slidesPerRow = int(math.sqrt(slidesPerPage))
    row = idonpage // slidesPerRow
    col = idonpage % slidesPerRow
    half = slidesPerRow / 2.
    # The four quadrant conditions are exhaustive, so the original
    # unreachable "o"/debug-print fallback was removed.
    if row < half:
        arrow = "&nwarr;" if col < half else "&nearr;"
    else:
        arrow = "&swarr;" if col < half else "&searr;"
    return str(printpage) + arrow  # + slides[docid].split("\t")[0]
def docdescriptionLATEX(docid, slides, slidesPerPage=16):
    '''
    LaTeX variant of docdescription: print-page number, a LaTeX arrow
    macro for the slide's quadrant, and the slide title (the text before
    the first tab) with underscores escaped for LaTeX.

    @param docid: zero-based slide index
    @param slides: list of slide texts; slides[docid] must contain a tab
    @param slidesPerPage: slides per printed page (assumed a square grid)
    @return: e.g. "3$\\nearrow$my\\_slide"
    '''
    # BUG FIX: use floor division so the page/row stay integers under
    # Python 3 as well (plain / was floor division only in Python 2).
    printpage = docid // slidesPerPage + 1
    idonpage = docid % slidesPerPage
    slidesPerRow = int(math.sqrt(slidesPerPage))
    row = idonpage // slidesPerRow
    col = idonpage % slidesPerRow
    half = slidesPerRow / 2.
    # The four quadrant conditions are exhaustive, so the original
    # unreachable "o"/debug-print fallback was removed.
    if row < half:
        arrow = "$\\nwarrow$" if col < half else "$\\nearrow$"
    else:
        arrow = "$\\swarrow$" if col < half else "$\\searrow$"
    return str(printpage) + arrow + slides[docid].split("\t")[0].replace("_", "\\_")
if __name__ == "__main__":
    # Build the corpus (one document per slide) and the BM25 index,
    # then emit index.tex: for every term, the pages/quadrants where it
    # scores, ordered by ascending BM25 score.
    slides = loadSlides()
    print "wait ..."
    invLists, docsizes = constructInvertedListsAndDocsizes(slides)
    # sortedTerms = sorted(invLists.keys(), key=lambda term: sum(computeBM25(term, invLists, docsizes, k=1.75, b=0.1).values()))
    # print sortedTerms
    # print sortedTerms
    with open("index.tex", "w") as fh:
        sortedTerms = sorted(invLists.keys())
        # Stop-words and OCR noise to exclude from the printed index.
        blacklist = set(["all", "and", "arbitrary", "ary", "ask", "avoid", "bbbaab", "bbaabbab", "can", "d", "d2", "d3", "d5", "d6", "do", "doesn", "doing", "each", "for", "follows", "from", "further", "getting", "given", "gives", "has", "have", "how", "in", "ir", "issues", "just", "lecture", "less", "more", "most", "mostly", "n", "not", "of", "on", "once", "one", "that", "the", "than", "their", "then", "them", "there", "third", "to", "today", "twelve", "twice", "two", "us", "used", "useful", "usually", "we", "way", "what", "whatever", "when", "whenever", "where", "which", "whereas", "whereever", "whether", "will", "with", "york"])
        # Skip everything alphabetically before "above" (numeric/junk terms).
        for term in sortedTerms[sortedTerms.index("above"):]:
            if term not in blacklist:
                # fh.write("<b>&#160;&#160;&#160;{}:</b>".format(term))
                fh.write("\\textbf{" + term.replace("_", "\\_") + ":} ")
                # One entry per occurrence, ascending BM25 score.
                fh.write(", ".join([docdescriptionLATEX(docid, slides).replace("_", "\\_") for docid, score in sorted(computeBM25(term, invLists, docsizes, k=1.75, b=0.75).items(), key=lambda i:i[1])]))
                fh.write("\n")
    # Interactive query loop (disabled):
    # while True:
    #     terms = extractTerms(raw_input("> "))
    #     t0 = time.time()
    #     q_result = processQuery(terms, invLists, docsizes)
    #     print "{} results in {}s".format(len(q_result), time.time() - t0)
    #     for docid, score in q_result[:5]:
    #         print "\n", score, "\n", slides[docid]
@ernestum
Copy link
Author

This script assumes that it is in a folder together with a file called 'allslidescropped.txt'. This file should contain the text of all slides. I generated it by assembling all slides into one huge PDF (pdfshuffler) and cropping the slides (krop). Then they were fed through the unix utility pdftotext. Go to line 159 and remove everything after 'arrow' to get a more compact version.
The script generates an index.tex file that can be included in a main text file. Mine looks like this:

\documentclass[12pt,a4paper]{article}
\usepackage[utf8]{inputenc}
\usepackage{graphicx}
\usepackage[left=0.2cm, right=0.20cm, top=0.20cm, bottom=0.20cm]{geometry}
\begin{document}
\noindent
\input{index.tex}
\end{document}

For each term you get a list of page numbers and arrows. They assume that you print the slides in a 4x4 grid. The page numbers then refer to those grid pages, and the arrows point in the general direction of the term on the page (top right, bottom right, top left, bottom left).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment