This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Given a text string, remove all non-alphanumeric
# characters (using Unicode definition of alphanumeric).
def stripNonAlphaNum(text):
    """Return the list of alphanumeric words contained in *text*.

    Every maximal run of word characters (regex ``\\w``: letters, digits
    and the underscore) becomes one list entry, in order of appearance.
    Everything else only acts as a separator and is discarded.

    Text that starts or ends with a separator, or that contains no word
    characters at all, produces no empty-string entries; an empty
    *text* returns ``[]``.
    """
    import re

    # findall() collects the words themselves.  Splitting on r'\W+'
    # instead would leave '' entries at the edges of the result whenever
    # the text begins or ends with punctuation or whitespace, and those
    # empty strings would then be counted as a "word" downstream.
    return re.findall(r'\w+', text, re.UNICODE)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#html-to-list1.py
# Fetch the page at `url`, turn it into a lower-cased word list and print
# the first 500 entries.
# NOTE: Python 2 script (urllib2). `obo` is a project-local module;
# stripTags presumably removes the HTML markup -- confirm against obo.py.
from contextlib import closing
import urllib2

import obo

url = 'http://www.oldbaileyonline.org/print.jsp?div=t17800628-33'

# urllib2 responses are not context managers, so closing() is used to make
# sure the connection is released even if read() raises.
with closing(urllib2.urlopen(url)) as response:
    html = response.read()

text = obo.stripTags(html).lower()
wordlist = obo.stripNonAlphaNum(text)

print(wordlist[0:500])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Indexing a string yields a single character; indexing a list yields a
# whole element.  The expected output of each print is shown after "->".
s = 'hello world'
print(s[0])  # -> h
print(s[1])  # -> e

m = ['hello', 'world']
print(m[0])  # -> hello
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A dictionary is indexed by key rather than by position.  The expected
# output of each print is shown after "->".
d = {'world': 1, 'hello': 0}
print(d['hello'])  # -> 0
print(d['world'])  # -> 1

# list() keeps the printed form a plain list on Python 3 as well, where
# keys() returns a view object.  Key order follows insertion order on
# Python 3.7+; on older interpreters it is arbitrary.
print(list(d.keys()))  # -> ['world', 'hello']
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# count-list-items-1.py
# Build, for a sample sentence, a list of frequencies parallel to the list
# of words: wordfreq[i] is the number of times wordlist[i] occurs.
wordstring = 'it was the best of times it was the worst of times '
wordstring += 'it was the age of wisdom it was the age of foolishness'

wordlist = wordstring.split()

wordfreq = []
for w in wordlist:
    # list.count() rescans the whole list on every call; that is fine for
    # a sentence this short but grows quadratically with the word count.
    wordfreq.append(wordlist.count(w))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
String
it was the best of times it was the worst of times it was the age of wisdom it was the age of foolishness
List
['it', 'was', 'the', 'best', 'of', 'times', 'it', 'was',
'the', 'worst', 'of', 'times', 'it', 'was', 'the', 'age',
'of', 'wisdom', 'it', 'was', 'the', 'age', 'of',
'foolishness']
Frequencies
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# count-list-items-1.py
# Same frequency count as the loop version, written as a list
# comprehension, followed by a printout of the intermediate values.
wordstring = 'it was the best of times it was the worst of times '
wordstring += 'it was the age of wisdom it was the age of foolishness'

wordlist = wordstring.split()

# wordfreq[i] is the number of occurrences of wordlist[i].
wordfreq = [wordlist.count(w) for w in wordlist]  # a list comprehension

# print() with one parenthesised argument produces the same output on
# Python 2 and Python 3.
print("String\n" + wordstring + "\n")
print("List\n" + str(wordlist) + "\n")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Given a list of words, return a dictionary of
# word-frequency pairs.
def wordListToFreqDict(wordlist):
    """Map every distinct word in *wordlist* to its number of occurrences.

    Returns a plain ``dict`` of ``{word: count}``; an empty *wordlist*
    yields ``{}``.  The words must be hashable, since they become
    dictionary keys.
    """
    from collections import Counter

    # Counter tallies the list in a single pass.  Calling list.count()
    # once per word rescans the whole list every time, which is quadratic
    # in the length of the text.  dict() keeps the return type a plain
    # dictionary.
    return dict(Counter(wordlist))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sort a dictionary of word-frequency pairs in
# order of descending frequency.
def sortFreqDict(freqdict):
    """Return ``(frequency, word)`` tuples sorted from most to least frequent.

    *freqdict* maps each word to its frequency.  Words that share a
    frequency are ordered by descending word, because the tuples are
    compared element by element.  An empty dictionary yields ``[]``.
    """
    # The frequency goes first in each tuple so that it is the primary
    # sort key.  The pairs are all distinct (dictionary keys are unique),
    # so one descending sort gives the same order as sorting ascending
    # and then reversing.
    pairs = ((freq, word) for word, freq in freqdict.items())
    return sorted(pairs, reverse=True)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#html-to-freq.py
# Fetch the page at `url` and turn it into a lower-cased word list.
# NOTE: Python 2 script (urllib2). `obo` is a project-local module;
# stripTags presumably removes the HTML markup -- confirm against obo.py.
from contextlib import closing
import urllib2

import obo

url = 'http://www.oldbaileyonline.org/print.jsp?div=t17800628-33'

# urllib2 responses are not context managers, so closing() is used to make
# sure the connection is released even if read() raises.
with closing(urllib2.urlopen(url)) as response:
    html = response.read()

text = obo.stripTags(html).lower()
wordlist = obo.stripNonAlphaNum(text)