Skip to content

Instantly share code, notes, and snippets.

@ernestum
Created February 22, 2016 08:09
Show Gist options
  • Save ernestum/7067f67e80eeba29a587 to your computer and use it in GitHub Desktop.
Save ernestum/7067f67e80eeba29a587 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import math
import re
import time
def extractTerms(document):
    '''
    Extracts all the terms (lowercased) from a document string.

    Splits on runs of non-word characters.  Note: a leading or trailing
    non-word character yields an empty-string term (re.split behavior),
    which downstream code counts toward document length.
    '''
    # Raw string: "\W" in a plain literal is an invalid escape sequence
    # (SyntaxWarning in modern Python); r"\W+" is the correct spelling.
    return [t.lower() for t in re.split(r"\W+", document)]
def loadSlides(filename="allslidescropped.txt"):
    '''
    Reads a pdftotext dump and splits it into one document per slide.

    A form-feed character at the start of a line marks the beginning of a
    new slide; the slide title is separated from the body by a tab.
    NOTE: if the file starts with a form feed, an empty document is
    emitted first (preserved from the original behavior so docids match).

    @param filename: path to the concatenated slide text
    @return: list of document strings
    '''
    documents = []
    with open(filename, 'r') as fh:
        currentDoc = ""
        for line in fh:
            # u'\uf06e' is a private-use bullet glyph from the PDF fonts.
            cleanline = line.strip().replace(u'\uf06e', " ")
            if line.startswith(u'\u000c'):
                # Form feed: flush the previous slide, start a new one.
                documents.append(currentDoc)
                currentDoc = cleanline + "\t"
            else:
                currentDoc = currentDoc + " " + cleanline
        # BUG FIX: the original never flushed the text after the last
        # form feed, silently dropping the final slide.
        if currentDoc:
            documents.append(currentDoc)
    return documents
def constructInvertedListsAndDocsizes(documents):
    '''
    Creates a dictionary of inverted lists for each term in the documents.
    Also creates a dictionary mapping docids to document lengths.

    @param documents: iterable of document strings
    @return: (invLists, documentLengths) where invLists maps term ->
             sorted list of docids (with repeats per occurrence) and
             documentLengths maps docid -> number of terms
    '''
    invLists = {}
    documentLengths = {}
    for docid, doc in enumerate(documents):
        terms = extractTerms(doc)
        documentLengths[docid] = len(terms)
        for term in terms:
            invLists.setdefault(term, []).append(docid)
    # Keep each postings list sorted by docid.
    for postings in invLists.values():
        postings.sort()
    return invLists, documentLengths
def mergeInvertedLitsts(list1, list2):
    '''
    Merges two sorted inverted lists into one sorted list.
    Duplicates are kept (every occurrence from both inputs appears in
    the result), so the output length is len(list1) + len(list2).
    '''
    result = []
    i = j = 0
    n1, n2 = len(list1), len(list2)
    while i < n1 and j < n2:
        if list1[i] < list2[j]:
            result.append(list1[i])
            i += 1
        else:
            result.append(list2[j])
            j += 1
    # One side is exhausted; the other's tail is already sorted.
    result.extend(list1[i:])
    result.extend(list2[j:])
    return result
def computeTermFrequencies(invertedList):
    '''
    Given an inverted list (a list of docids where a term or multiple terms
    appear in) that might contain some docids multiple times, computes a list
    of tuples containing the docid and the number of times it occurred in the
    inverted list, sorted by ascending count.
    '''
    counts = {}
    for docid in invertedList:
        counts[docid] = counts.get(docid, 0) + 1
    return sorted(counts.items(), key=lambda kv: kv[1])
def computeBM25(term, invertedLists, documentLenghts, k=1.75, b=0.75):
    '''
    Computes the BM25 score for a term on all the documents it occurs in.
    @param term: The term we are interested in
    @param invertedLists: a dictionary mapping terms to lists of docids they appear in
    @param documentLenghts: a dictionary mapping docids to document lengths
    @param k: A parameter for the BM25 score
    @param b: A parameter for the BM25 score
    @return: A dictionary mapping docids to the BM25 score for the given term.
    '''
    tfs = dict(computeTermFrequencies(invertedLists[term]))
    # BUG FIX: force float division.  Under Python 2 (this script's
    # dialect) the original sum/len truncated avdl to an integer,
    # distorting the document-length normalization below.
    avdl = sum(documentLenghts.values()) / float(len(documentLenghts))
    tf_stars = {docid: tfs[docid] * (k + 1) /
                       (k * (1 - b + b * documentLenghts[docid] / avdl) + tfs[docid])
                for docid in tfs}
    # Inverse document frequency: log2(N / df).
    idf = math.log(len(documentLenghts) / float(len(tfs)), 2)
    return {docid: tf_stars[docid] * idf for docid in tfs}
def mergeBM25Scores(bm25scores):
    '''
    Merges dictionaries of BM25 scores that were computed for different terms.
    Returns a new dictionary where the scores are just added up for each document.
    '''
    merged = {}
    for scores in bm25scores:
        for docid, value in scores.items():
            merged[docid] = merged.get(docid, 0) + value
    return merged
def processQuery(query_terms, invertedLists, documentLengths):
    '''
    Ranks documents for the given query terms by their summed BM25 score.
    Terms not present in the index are ignored; duplicate query terms
    count once.  Returns (docid, score) pairs, best score first.
    '''
    unique_terms = set(query_terms)
    per_term_scores = [computeBM25(term, invertedLists, documentLengths)
                       for term in unique_terms if term in invertedLists]
    ranking = sorted(mergeBM25Scores(per_term_scores).items(),
                     key=lambda kv: kv[1])
    ranking.reverse()
    return ranking
def docdescription(docid, slides, slidesPerPage=16):
    '''
    Describes where a slide sits when printed slidesPerPage to a page:
    the print-page number followed by an HTML arrow entity pointing to
    the slide's quadrant on that page.

    @param docid: zero-based slide index
    @param slides: list of slide texts (currently unused, kept for
                   interface symmetry with docdescriptionLATEX)
    @param slidesPerPage: slides per printed page (assumed a square grid)
    @return: e.g. "3&nearr;" for a slide in the top-right of page 3
    '''
    # BUG FIX: use floor division so the page/row stay integers under
    # Python 3 as well (plain / was floor division only in Python 2).
    printpage = docid // slidesPerPage + 1
    idonpage = docid % slidesPerPage
    slidesPerRow = int(math.sqrt(slidesPerPage))
    row = idonpage // slidesPerRow
    col = idonpage % slidesPerRow
    half = slidesPerRow / 2.
    # The four quadrant conditions are exhaustive, so the original
    # unreachable "o"/debug-print fallback was removed.
    if row < half:
        arrow = "&nwarr;" if col < half else "&nearr;"
    else:
        arrow = "&swarr;" if col < half else "&searr;"
    return str(printpage) + arrow  # + slides[docid].split("\t")[0]
def docdescriptionLATEX(docid, slides, slidesPerPage=16):
    '''
    LaTeX variant of docdescription: print-page number, a LaTeX arrow
    macro for the slide's quadrant, and the slide title (the text before
    the first tab) with underscores escaped for LaTeX.

    @param docid: zero-based slide index
    @param slides: list of slide texts; slides[docid] must contain a tab
    @param slidesPerPage: slides per printed page (assumed a square grid)
    @return: e.g. "3$\\nearrow$my\\_slide"
    '''
    # BUG FIX: use floor division so the page/row stay integers under
    # Python 3 as well (plain / was floor division only in Python 2).
    printpage = docid // slidesPerPage + 1
    idonpage = docid % slidesPerPage
    slidesPerRow = int(math.sqrt(slidesPerPage))
    row = idonpage // slidesPerRow
    col = idonpage % slidesPerRow
    half = slidesPerRow / 2.
    # The four quadrant conditions are exhaustive, so the original
    # unreachable "o"/debug-print fallback was removed.
    if row < half:
        arrow = "$\\nwarrow$" if col < half else "$\\nearrow$"
    else:
        arrow = "$\\swarrow$" if col < half else "$\\searrow$"
    return str(printpage) + arrow + slides[docid].split("\t")[0].replace("_", "\\_")
if __name__ == "__main__":
    # Build the corpus (one document per slide) and the BM25 index,
    # then emit index.tex: for every term, the pages/quadrants where it
    # scores, ordered by ascending BM25 score.
    slides = loadSlides()
    print "wait ..."
    invLists, docsizes = constructInvertedListsAndDocsizes(slides)
    # sortedTerms = sorted(invLists.keys(), key=lambda term: sum(computeBM25(term, invLists, docsizes, k=1.75, b=0.1).values()))
    # print sortedTerms
    # print sortedTerms
    with open("index.tex", "w") as fh:
        sortedTerms = sorted(invLists.keys())
        # Stop-words and OCR noise to exclude from the printed index.
        blacklist = set(["all", "and", "arbitrary", "ary", "ask", "avoid", "bbbaab", "bbaabbab", "can", "d", "d2", "d3", "d5", "d6", "do", "doesn", "doing", "each", "for", "follows", "from", "further", "getting", "given", "gives", "has", "have", "how", "in", "ir", "issues", "just", "lecture", "less", "more", "most", "mostly", "n", "not", "of", "on", "once", "one", "that", "the", "than", "their", "then", "them", "there", "third", "to", "today", "twelve", "twice", "two", "us", "used", "useful", "usually", "we", "way", "what", "whatever", "when", "whenever", "where", "which", "whereas", "whereever", "whether", "will", "with", "york"])
        # Skip everything alphabetically before "above" (numeric/junk terms).
        for term in sortedTerms[sortedTerms.index("above"):]:
            if term not in blacklist:
                # fh.write("<b>&#160;&#160;&#160;{}:</b>".format(term))
                fh.write("\\textbf{" + term.replace("_", "\\_") + ":} ")
                # One entry per occurrence, ascending BM25 score.
                fh.write(", ".join([docdescriptionLATEX(docid, slides).replace("_", "\\_") for docid, score in sorted(computeBM25(term, invLists, docsizes, k=1.75, b=0.75).items(), key=lambda i:i[1])]))
                fh.write("\n")
    # Interactive query loop (disabled):
    # while True:
    #     terms = extractTerms(raw_input("> "))
    #     t0 = time.time()
    #     q_result = processQuery(terms, invLists, docsizes)
    #     print "{} results in {}s".format(len(q_result), time.time() - t0)
    #     for docid, score in q_result[:5]:
    #         print "\n", score, "\n", slides[docid]
@ernestum
Copy link
Author

This script assumes that it is in a folder together with a file called 'allslidescropped.txt'. This file should contain the text of all slides. I generated it by assembling all slides into one huge PDF (pdfshuffler) and cropping the slides (krop). Then they were fed through the unix utility pdftotext. Go to line 159 and remove everything after 'arrow' to get a more compact version.
The script generates an index.tex file that can be included in a main text file. Mine looks like this:

\documentclass[12pt,a4paper]{article}
\usepackage[utf8]{inputenc}
\usepackage{graphicx}
\usepackage[left=0.2cm, right=0.20cm, top=0.20cm, bottom=0.20cm]{geometry}
\begin{document}
\noindent
\input{index.tex}
\end{document}

For each term you get a list of page numbers and arrows. They assume that you print the slides in a 4x4 grid. The page numbers then refer to those grid pages, and the arrows point in the general direction of the term on the page (top right, bottom right, top left, bottom left).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment