March 5, 2014 11:43
diff --git a/file1.py b/file1.py
 # Interactive session history (IPython-style: shell commands such as
 # pwd/ls/ll/cat are mixed with Python). This part compares two stop-word files.
 pwd
 # Read each file and split its text on whitespace into a list of words.
 s1 = open('stopwords.txt','r').read().split()
 s1
 s2 = open('../scholarec/corpus/stopwords.txt','r').read().split()
 s2
 set(s1)
 set(s2)
 # Words that occur in the local file but not in the corpus file.
 set(s1)-set(s2)
 # NOTE(review): `l` is not assigned anywhere in the visible history;
 # presumably a mistyped `ls` -- confirm.
 l
 ls
 ll stopwords.txt
 ll ../scholarec/corpus/stopwords.txt
 # Reverse difference: words only in the corpus file.
 set(s2)-set(s1)
 ls
 # Line counts of both files, obtained through the shell.
 cat stopwords.txt| wc -l
 cat ../scholarec/corpus/stopwords.txt| wc -l
 # Python sets do not implement `+` (TypeError); a union is written `|`.
 set(s2)+set(s1)
 s1
 s2
 # List concatenation keeps duplicates, so this length is len(s1) + len(s2).
 s1+s2
 len(s1+s2)
 # extend() mutates s1 in place: from here on s1 also holds every word of s2.
 s1.extend(s2)
 s1
 len(s1)
 # Merged list: all of s2, followed by the words found in s1 but not in s2.
 # The appended part comes from a set, so its order is arbitrary.
 s2 + list(set(s1) - set(s2))
 len(s2 + list(set(s1) - set(s2)))
 # Imports and constants for the word-count steps that follow.
 # cPickle is a Python 2 module name.
 import os.path
 import os
 import sys
 import cPickle as pickle
 from string import punctuation
 from operator import itemgetter
 import re
 # Upper bound on how many of the most frequent words are kept (used as [:N]).
 N= 100
 ls
 len(s2 + list(set(s1) - set(s2)))
 # Mistake kept in the history: this binds the *length* (an int) to
 # `stopwords`, so the comprehension on the next line cannot iterate over it.
 stopwords = len(s2 + list(set(s1) - set(s2)))
 stopwords = [x.strip(punctuation) for x in stopwords if len(x)>2]
 stopwords
 # Corrected attempt: bind the merged word list itself.
 stopwords = s2 + list(set(s1) - set(s2))
 # Keep entries longer than 2 characters (length is tested before stripping)
 # and remove leading/trailing punctuation from each kept entry.
 stopwords = [x.strip(punctuation) for x in stopwords if len(x)>2]
 stopwords
 pdfpath = 'paper.pdf'
 # NOTE(review): `pid` is never assigned in the visible history, so this line
 # presumably raised NameError; the next line sets the path actually used.
 picklepath = os.path.join('db', pid, 'topwords.p')
 picklepath = os.path.join('topwords.p')
 picklepath
 # Build the pdftotext command line that converts the PDF into out.txt.
 cmd = "pdftotext %s %s" % (pdfpath, "out.txt")
 cmd
 rm out.txt
 from subprocess import call
 # NOTE(review): without shell=True, subprocess on POSIX treats a plain string
 # as the name of a single program, so this first attempt presumably failed.
 call(cmd)
 cmd
 ls
 call
 cmd
 # Splitting on whitespace gives the argument-list form passed to call() below.
 cmd.split()
 # call() returns the exit status of the child process; it is kept in x.
 x = call(cmd.split())
 ls
 # Remove the output and run the conversion once more.
 rm out.txt
 x = call(cmd.split())
 ls
 # Inspect the stored return code (for an int, .real is the value itself).
 x.real
 ls
 # Split the extracted text on whitespace into raw tokens.
 txtlst = open("out.txt").read().split()
 # Keep only tokens made up entirely of word characters and hyphens,
 # converted to lower case.
 words = [x.lower() for x in txtlst if re.match('^[\w-]+$', x) is not None]
 words
 # Drop words of 2 characters or fewer and anything in the stop-word list.
 # `stopwords` is a list, so each membership test is a linear scan.
 words = [x for x in words if len(x)>2 and (not x in stopwords)]
 words
 len(words)
 # NOTE(review): the next two lines use `top` and `wcount` before any visible
 # assignment; presumably both raised NameError in the live session.
 pickle.dump(top, open(picklepath, "wb"))
 top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N]
 wcount = {}
 wcount.iteritems()
 # wcount is still empty at this point, so `top` is an empty list.
 top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N]
 top
 wcount = {}
 top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N]
 top
 words
 # Count how many times each remaining word occurs.
 for w in words: wcount[w] = wcount.get(w, 0) + 1
 wcount
 # (word, count) pairs sorted by count, highest first, cut to at most N.
 # dict.iteritems() is a Python 2 method.
 top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N]
 top
 # Store the top-word list in the pickle file.
 pickle.dump(top, open(picklepath, "wb"))
 ls
 less topwords.p
 picklepath
 # Load the list back from the same file to check what was written.
 twords = pickle.load(open(picklepath, "rb"))
 twords
 # Convert the list of (word, count) pairs into a dict keyed by word.
 dict(iter(twords))
 # IPython magic: upload input-history lines 1-91 to a pastebin service.
 %pastebin 1-91
	# Second, tab-indented copy of the same session history (IPython-style:
	# shell commands are mixed with Python). This part compares two stop-word files.
	pwd
	# Read each file and split its text on whitespace into a list of words.
	s1 = open('stopwords.txt','r').read().split()
	s1
	s2 = open('../scholarec/corpus/stopwords.txt','r').read().split()
	s2
	set(s1)
	set(s2)
	# Words that occur in the local file but not in the corpus file.
	set(s1)-set(s2)
	# NOTE(review): `l` is not assigned anywhere in the visible history;
	# presumably a mistyped `ls` -- confirm.
	l
	ls
	ll stopwords.txt
	ll ../scholarec/corpus/stopwords.txt
	# Reverse difference: words only in the corpus file.
	set(s2)-set(s1)
	ls
	# Line counts of both files, obtained through the shell. The `\|` looks like
	# an escaped pipe introduced by the paste rendering -- TODO confirm.
	cat stopwords.txt\| wc -l
	cat ../scholarec/corpus/stopwords.txt\| wc -l
	# Python sets do not implement `+` (TypeError); a union is written `|`.
	set(s2)+set(s1)
	s1
	s2
	# List concatenation keeps duplicates, so this length is len(s1) + len(s2).
	s1+s2
	len(s1+s2)
	# extend() mutates s1 in place: from here on s1 also holds every word of s2.
	s1.extend(s2)
	s1
	len(s1)
	# Merged list: all of s2, followed by the words found in s1 but not in s2.
	# The appended part comes from a set, so its order is arbitrary.
	s2 + list(set(s1) - set(s2))
	len(s2 + list(set(s1) - set(s2)))
	# Imports and constants for the word-count steps that follow.
	# cPickle is a Python 2 module name.
	import os.path
	import os
	import sys
	import cPickle as pickle
	from string import punctuation
	from operator import itemgetter
	import re
	# Upper bound on how many of the most frequent words are kept (used as [:N]).
	N= 100
	ls
	len(s2 + list(set(s1) - set(s2)))
	# Mistake kept in the history: this binds the *length* (an int) to
	# `stopwords`, so the comprehension on the next line cannot iterate over it.
	stopwords = len(s2 + list(set(s1) - set(s2)))
	stopwords = [x.strip(punctuation) for x in stopwords if len(x)>2]
	stopwords
	# Corrected attempt: bind the merged word list itself.
	stopwords = s2 + list(set(s1) - set(s2))
	# Keep entries longer than 2 characters (length is tested before stripping)
	# and remove leading/trailing punctuation from each kept entry.
	stopwords = [x.strip(punctuation) for x in stopwords if len(x)>2]
	stopwords
	pdfpath = 'paper.pdf'
	# NOTE(review): `pid` is never assigned in the visible history, so this line
	# presumably raised NameError; the next line sets the path actually used.
	picklepath = os.path.join('db', pid, 'topwords.p')
	picklepath = os.path.join('topwords.p')
	picklepath
	# Build the pdftotext command line that converts the PDF into out.txt.
	cmd = "pdftotext %s %s" % (pdfpath, "out.txt")
	cmd
	rm out.txt
	from subprocess import call
	# NOTE(review): without shell=True, subprocess on POSIX treats a plain string
	# as the name of a single program, so this first attempt presumably failed.
	call(cmd)
	cmd
	ls
	call
	cmd
	# Splitting on whitespace gives the argument-list form passed to call() below.
	cmd.split()
	# call() returns the exit status of the child process; it is kept in x.
	x = call(cmd.split())
	ls
	# Remove the output and run the conversion once more.
	rm out.txt
	x = call(cmd.split())
	ls
	# Inspect the stored return code (for an int, .real is the value itself).
	x.real
	ls
	# Split the extracted text on whitespace into raw tokens.
	txtlst = open("out.txt").read().split()
	# Keep only tokens made up entirely of word characters and hyphens,
	# converted to lower case.
	words = [x.lower() for x in txtlst if re.match('^[\w-]+$', x) is not None]
	words
	# Drop words of 2 characters or fewer and anything in the stop-word list.
	# `stopwords` is a list, so each membership test is a linear scan.
	words = [x for x in words if len(x)>2 and (not x in stopwords)]
	words
	len(words)
	# NOTE(review): the next two lines use `top` and `wcount` before any visible
	# assignment; presumably both raised NameError in the live session.
	pickle.dump(top, open(picklepath, "wb"))
	top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N]
	wcount = {}
	wcount.iteritems()
	# wcount is still empty at this point, so `top` is an empty list.
	top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N]
	top
	wcount = {}
	top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N]
	top
	words
	# Count how many times each remaining word occurs.
	for w in words: wcount[w] = wcount.get(w, 0) + 1
	wcount
	# (word, count) pairs sorted by count, highest first, cut to at most N.
	# dict.iteritems() is a Python 2 method.
	top = sorted(wcount.iteritems(), key=itemgetter(1), reverse=True)[:N]
	top
	# Store the top-word list in the pickle file.
	pickle.dump(top, open(picklepath, "wb"))
	ls
	less topwords.p
	picklepath
	# Load the list back from the same file to check what was written.
	twords = pickle.load(open(picklepath, "rb"))
	twords
	# Convert the list of (word, count) pairs into a dict keyed by word.
	dict(iter(twords))
	# IPython magic: upload input-history lines 1-91 to a pastebin service.
	%pastebin 1-91
No results found