Created
February 22, 2016 08:09
-
-
Save ernestum/7067f67e80eeba29a587 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import math
import re
import time
from collections import Counter
def extractTerms(document):
    '''
    Extracts all the terms (lowercased) from a document string.

    Terms are maximal runs of word characters; everything else is a
    separator.

    @param document: the document text as one string
    @return: list of lowercased terms, in order of appearance

    BUG FIX: re.split(r"\W+", ...) produces empty strings when the document
    starts or ends with a non-word character; the original kept those,
    creating a bogus "" term in the index and inflating document lengths.
    They are filtered out here. The pattern is also a raw string now, so
    Python 3 does not warn about the "\W" escape.
    '''
    return [t.lower() for t in re.split(r"\W+", document) if t]
def loadSlides(filename="allslidescropped.txt"):
    '''
    Loads the slide texts from a pdftotext dump, one document per slide.

    Pages are delimited by formfeed characters (U+000C) at the start of a
    line, as emitted by pdftotext. The first line of each page (the slide
    title) is separated from the body by a tab so it can be recovered with
    split("\t")[0] later.

    @param filename: path to the text dump of all slides
    @return: list of document strings, one per slide

    BUG FIX: the original never appended the document accumulated after the
    last formfeed, so the final slide was silently dropped; it is now
    appended after the loop.
    '''
    documents = []
    with open(filename, 'r') as fh:
        currentDoc = ""
        for line in fh:
            # \uf06e is a private-use bullet glyph pdftotext emits; drop it.
            cleanline = line.strip().replace(u'\uf06e', " ")
            if line.startswith(u'\u000c'):
                # A formfeed starts a new page: the previous one is complete.
                documents.append(currentDoc)
                currentDoc = cleanline + "\t"
            else:
                currentDoc = currentDoc + " " + cleanline
        if currentDoc:
            documents.append(currentDoc)
    return documents
def constructInvertedListsAndDocsizes(documents):
    '''
    Builds the index structures for a list of documents.

    @param documents: list of document strings; their positions become docids
    @return: a pair (invLists, documentLengths) where invLists maps each
             term to a sorted list of docids (one entry per occurrence, so
             docids repeat) and documentLengths maps each docid to the
             number of terms in that document.
    '''
    invLists = dict()
    documentLengths = dict()
    for docid, doc in enumerate(documents):
        termlist = extractTerms(doc)
        documentLengths[docid] = len(termlist)
        for t in termlist:
            invLists.setdefault(t, []).append(docid)
    # Postings are appended in docid order already, but sort defensively.
    for postings in invLists.values():
        postings.sort()
    return invLists, documentLengths
def mergeInvertedLitsts(list1, list2):
    '''
    Merges two sorted inverted lists into one sorted list, keeping
    duplicates (standard two-pointer merge).

    @param list1: sorted list of docids
    @param list2: sorted list of docids
    @return: a new sorted list containing all entries of both inputs
    '''
    merged = []
    i = j = 0
    n1, n2 = len(list1), len(list2)
    while i < n1 and j < n2:
        if list1[i] < list2[j]:
            merged.append(list1[i])
            i += 1
        else:
            # On ties, list2's element goes first (as in the original).
            merged.append(list2[j])
            j += 1
    # At most one of these tails is non-empty.
    merged.extend(list1[i:])
    merged.extend(list2[j:])
    return merged
def computeTermFrequencies(invertedList):
    '''
    Given an inverted list (a list of docids where a term or multiple terms
    appear in) that might contain some docids multiple times, computes a list
    of tuples containing the docid and the number of times it occurred in the
    inverted list, sorted by ascending count.

    @param invertedList: list of docids, possibly with repeats
    @return: list of (docid, count) tuples sorted by count (ascending)

    Uses collections.Counter instead of the original hand-rolled dict
    counting loop; the result is identical.
    '''
    counts = Counter(invertedList)
    return sorted(counts.items(), key=lambda t: t[1])
def computeBM25(term, invertedLists, documentLenghts, k=1.75, b=0.75):
    '''
    Computes the BM25 score for a term on all the documents it occurs in.

    @param term: The term we are interested in
    @param invertedLists: a dictionary mapping terms to lists of docids they appear in
    @param documentLenghts: a dictionary mapping docids to document lengths
    @param k: A parameter for the BM25 score (term-frequency saturation)
    @param b: A parameter for the BM25 score (length normalization)
    @return: A dictionary mapping docids to the BM25 score for the given term.

    BUG FIX: the average document length was computed with plain "/", which
    truncates to an integer under Python 2 and skews every score; it is now
    a true float mean. The per-document counts come directly from Counter,
    which is equivalent to dict(computeTermFrequencies(...)).
    '''
    # Term frequency per document: how often `term` occurs in each docid.
    tfs = Counter(invertedLists[term])
    # True (float) average document length over the whole collection.
    avdl = sum(documentLenghts.values()) / float(len(documentLenghts))
    # tf* = tf (k+1) / (k (1 - b + b dl/avdl) + tf)  -- saturated, length-normalized tf
    tf_stars = {docid: tfs[docid] * (k + 1) / (k * (1 - b + b * documentLenghts[docid] / avdl) + tfs[docid])
                for docid in tfs.keys()}
    # idf = log2(N / df)
    idf = math.log(len(documentLenghts) / float(len(tfs)), 2)
    return {docid: tf_stars[docid] * idf for docid in tfs.keys()}
def mergeBM25Scores(bm25scores):
    '''
    Merges dictionaries of BM25 scores that were computed for different
    terms into a new dictionary where the scores are summed per document.

    @param bm25scores: iterable of dicts mapping docid -> score
    @return: dict mapping docid -> total score over all input dicts
    '''
    totals = dict()
    for scores in bm25scores:
        for docid, score in scores.items():
            totals[docid] = totals.get(docid, 0) + score
    return totals
def processQuery(query_terms, invertedLists, documentLengths):
    '''
    Scores the documents against a query and returns them best-first.

    @param query_terms: iterable of query terms (duplicates are ignored)
    @param invertedLists: dict mapping terms to inverted lists
    @param documentLengths: dict mapping docids to document lengths
    @return: list of (docid, score) pairs, highest score first
    '''
    unique_terms = set(query_terms)
    # Terms unknown to the index contribute nothing and are skipped.
    per_term = [computeBM25(t, invertedLists, documentLengths)
                for t in unique_terms if t in invertedLists]
    ranked = sorted(mergeBM25Scores(per_term).items(), key=lambda kv: kv[1])
    return list(reversed(ranked))
def docdescription(docid, slides, slidesPerPage=16):
    '''
    Describes where a slide sits when the slides are printed in a square
    grid (by default 4x4): the printed page number plus an arrow pointing
    at the quadrant of that page containing the slide.

    @param docid: the slide's document id (0-based)
    @param slides: list of slide texts (unused here, kept for interface
                   symmetry with docdescriptionLATEX)
    @param slidesPerPage: slides per printed page; must be a square number
    @return: page number string followed by an arrow character

    BUG FIX: the original used "/" for the page/row computations, which is
    floor division only under Python 2; "//" keeps the intended integer
    semantics on both Python versions. The original's fall-through
    "arrow == 'o'" debug branch was unreachable (the four quadrant cases
    are exhaustive) and has been removed.
    '''
    printpage = docid // slidesPerPage + 1
    idonpage = docid % slidesPerPage
    slidesPerRow = int(math.sqrt(slidesPerPage))
    row = idonpage // slidesPerRow
    col = idonpage % slidesPerRow
    half = slidesPerRow / 2.
    if row < half:
        arrow = "↖" if col < half else "↗"
    else:
        arrow = "↙" if col < half else "↘"
    return str(printpage) + arrow  # + slides[docid].split("\t")[0]
def docdescriptionLATEX(docid, slides, slidesPerPage=16):
    '''
    LaTeX variant of docdescription: page number, a LaTeX arrow macro for
    the quadrant, and the slide's title (text before the first tab) with
    underscores escaped for LaTeX.

    @param docid: the slide's document id (0-based)
    @param slides: list of slide texts; slides[docid] must contain a tab
                   separating the title from the body (see loadSlides)
    @param slidesPerPage: slides per printed page; must be a square number
    @return: e.g. "3$\\nearrow$some\\_title"

    BUG FIX: as in docdescription, "//" replaces the Python-2-only integer
    "/" and the unreachable "arrow == 'o'" debug branch was removed.
    '''
    printpage = docid // slidesPerPage + 1
    idonpage = docid % slidesPerPage
    slidesPerRow = int(math.sqrt(slidesPerPage))
    row = idonpage // slidesPerRow
    col = idonpage % slidesPerRow
    half = slidesPerRow / 2.
    if row < half:
        arrow = "$\\nwarrow$" if col < half else "$\\nearrow$"
    else:
        arrow = "$\\swarrow$" if col < half else "$\\searrow$"
    return str(printpage) + arrow + slides[docid].split("\t")[0].replace("_", "\\_")
if __name__ == "__main__":
    # Entry point: build the index over all slide texts and write a LaTeX
    # keyword index to index.tex, meant to be \input into a main document.
    # NOTE(review): uses Python-2 print statements; runs under Python 2 only.
    slides = loadSlides()
    print "wait ..."
    invLists, docsizes = constructInvertedListsAndDocsizes(slides)
    # sortedTerms = sorted(invLists.keys(), key=lambda term: sum(computeBM25(term, invLists, docsizes, k=1.75, b=0.1).values()))
    # print sortedTerms
    # print sortedTerms
    with open("index.tex", "w") as fh:
        # Terms are emitted alphabetically; everything lexicographically
        # before "above" is skipped (drops numbers/empty tokens), and
        # hand-picked stop words in the blacklist are left out entirely.
        sortedTerms = sorted(invLists.keys())
        blacklist = set(["all", "and", "arbitrary", "ary", "ask", "avoid", "bbbaab", "bbaabbab", "can", "d", "d2", "d3", "d5", "d6", "do", "doesn", "doing", "each", "for", "follows", "from", "further", "getting", "given", "gives", "has", "have", "how", "in", "ir", "issues", "just", "lecture", "less", "more", "most", "mostly", "n", "not", "of", "on", "once", "one", "that", "the", "than", "their", "then", "them", "there", "third", "to", "today", "twelve", "twice", "two", "us", "used", "useful", "usually", "we", "way", "what", "whatever", "when", "whenever", "where", "which", "whereas", "whereever", "whether", "will", "with", "york"])
        for term in sortedTerms[sortedTerms.index("above"):]:
            if term not in blacklist:
                # fh.write("<b>&nbsp;&nbsp; {}:</b>".format(term))
                # Bold term label, then its locations sorted by ascending
                # BM25 score (best match last).
                fh.write("\\textbf{" + term.replace("_", "\\_") + ":} ")
                fh.write(", ".join([docdescriptionLATEX(docid, slides).replace("_", "\\_") for docid, score in sorted(computeBM25(term, invLists, docsizes, k=1.75, b=0.75).items(), key=lambda i:i[1])]))
                fh.write("\n")
    # Alternative interactive mode: type a query, get the top-5 slides.
    # while True:
    #     terms = extractTerms(raw_input("> "))
    #     t0 = time.time()
    #     q_result = processQuery(terms, invLists, docsizes)
    #     print "{} results in {}s".format(len(q_result), time.time() - t0)
    #     for docid, score in q_result[:5]:
    #         print "\n", score, "\n", slides[docid]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This script assumes that it is in a folder together with a file called 'allslidescropped.txt'. This file should contain the text of all slides. I generated it by assembling all slides into one huge PDF (pdfshuffler) and cropping the slides (krop). Then they were fed through the unix utility pdftotext. Go to line 159 and remove everything after 'arrow' to get a more compact version.
The script generates an index.tex file that can be included in a main text file. Mine looks like this:
For each term you get a list of page numbers and arrows. They assume that you print the slides in a 4x4 grid. The page numbers then refer to those grid pages, and the arrows point in the general direction of the term on the page (top right, bottom right, top left, bottom left).