Skip to content

Instantly share code, notes, and snippets.

View acrymble's full-sized avatar

Adam Crymble acrymble

  • London
View GitHub Profile
@acrymble
acrymble / stripNonAlphaNum.py
Created July 5, 2011 19:15
Strip Non Alpha-Numeric
# Given a text string, remove all non-alphanumeric
# characters (using Unicode definition of alphanumeric).
def stripNonAlphaNum(text):
    """Split ``text`` into its runs of alphanumeric characters.

    "Alphanumeric" means the regular-expression word-character class
    under ``re.UNICODE``: letters and digits of any script, plus the
    underscore.

    Returns a list of the non-empty words in their original order.
    Text that is empty, or that starts or ends with punctuation or
    whitespace, does not produce empty-string entries.
    """
    import re
    # Collect the \w+ runs directly.  Splitting on \W+ instead would
    # leave '' at either end of the list whenever the text begins or
    # ends with a non-alphanumeric character, and those empty strings
    # would later be counted as words.
    return re.findall(r'\w+', text, re.UNICODE)
@acrymble
acrymble / html-to-list1.py
Created July 5, 2011 19:16
HTML to a List of Words
# html-to-list1.py
# Download a page from oldbaileyonline.org, reduce it to a word list and
# print the first 500 entries.
import urllib2
import obo

url = 'http://www.oldbaileyonline.org/print.jsp?div=t17800628-33'

# Read the whole HTTP response body in one go.
html = urllib2.urlopen(url).read()

# NOTE(review): obo is a project-local module not shown here; stripTags
# presumably removes the HTML markup -- confirm against obo.py.
text = obo.stripTags(html).lower()
wordlist = obo.stripNonAlphaNum(text)

# Single-argument parenthesised print behaves exactly like the
# Python 2 print statement.
print(wordlist[0:500])
@acrymble
acrymble / indicies.py
Created July 5, 2011 19:17
String Indices
# Indexing works on strings and on lists alike: s[i] is the character
# at position i of a string, m[i] the element at position i of a list.
# Positions count from 0.  The expected output of each print is shown
# in the comment that follows it.
s = 'hello world'
print(s[0])
# -> h
print(s[1])
# -> e

m = ['hello', 'world']
print(m[0])
# -> hello
@acrymble
acrymble / dictionary.py
Created July 5, 2011 19:18
Python Dictionary
# A dictionary maps keys to values; d[key] looks up the value stored
# under key.  The expected output of each print is shown in the comment
# that follows it.
d = {'world': 1, 'hello': 0}
print(d['hello'])
# -> 0
print(d['world'])
# -> 1

# list() keeps the output a plain list on Python 3 as well, where
# keys() returns a view object; on Python 2 it changes nothing.
# Python 2 dictionaries do not guarantee any particular key order.
print(list(d.keys()))
# -> ['world', 'hello']
@acrymble
acrymble / count-list-items-1.py
Created July 5, 2011 19:20
Python count items in a list
# count-list-items-1.py
# Build a list holding, for every word of the sample text, the number
# of times that word occurs in the text.
wordstring = ('it was the best of times it was the worst of times '
              'it was the age of wisdom it was the age of foolishness')
wordlist = wordstring.split()

# One count per word, in text order; a repeated word repeats its count.
wordfreq = list(map(wordlist.count, wordlist))
@acrymble
acrymble / count-list-items-1-result.py
Created July 5, 2011 19:24
Python count items in a list result
String
it was the best of times it was the worst of times it was the age of wisdom it was the age of foolishness
List
['it', 'was', 'the', 'best', 'of', 'times', 'it', 'was',
'the', 'worst', 'of', 'times', 'it', 'was', 'the', 'age',
'of', 'wisdom', 'it', 'was', 'the', 'age', 'of',
'foolishness']
Frequencies
@acrymble
acrymble / count-list-items-1.py
Created July 5, 2011 19:25
Python count items in a list 2
# count-list-items-1.py
# Count how often each word of the sample text occurs, then print the
# text and its word list.
wordstring = 'it was the best of times it was the worst of times '
wordstring += 'it was the age of wisdom it was the age of foolishness'

wordlist = wordstring.split()

# a list comprehension: one count per word, in text order
wordfreq = [wordlist.count(w) for w in wordlist]

# Parenthesised single-argument print produces the same output as the
# Python 2 print statement and also parses on Python 3.
print("String\n" + wordstring + "\n")
print("List\n" + str(wordlist) + "\n")
@acrymble
acrymble / world-list-to-freq-dict.py
Created July 5, 2011 19:26
Python word list to word-frequency pairs
# Given a list of words, return a dictionary of
# word-frequency pairs.
def wordListToFreqDict(wordlist):
    """Count how often each word occurs in ``wordlist``.

    ``wordlist`` is an iterable of hashable items (normally strings).

    Returns a plain ``dict`` mapping each distinct word to the number
    of times it appears; an empty input gives an empty dict.
    """
    # Counter (Python 2.7+) tallies every word in a single pass.  Calling
    # wordlist.count() once per word rescans the whole list each time,
    # which is quadratic on a full-length text.
    from collections import Counter
    return dict(Counter(wordlist))
@acrymble
acrymble / sort-freq-dict.py
Created July 5, 2011 19:27
Python Sort a dictionary of word-frequency Pairs
# Sort a dictionary of word-frequency pairs in
# order of descending frequency.
def sortFreqDict(freqdict):
    """Return ``freqdict`` as a list of ``(frequency, word)`` tuples.

    The list runs from the highest frequency to the lowest; words that
    share a frequency appear in descending order of the word itself.
    """
    # Put the frequency first so that tuple comparison orders by it.
    ascending = sorted((freqdict[word], word) for word in freqdict)
    # Reversing the ascending list yields the descending order.
    return ascending[::-1]
@acrymble
acrymble / html-to-freq.py
Created July 5, 2011 19:28
Python HTML to frequency pairs
# html-to-freq.py
# Download a page from oldbaileyonline.org and reduce it to a word list.
import urllib2
import obo

url = 'http://www.oldbaileyonline.org/print.jsp?div=t17800628-33'

# Fetch the page and read the whole response body.
response = urllib2.urlopen(url)
html = response.read()

# NOTE(review): obo is a project-local module not shown here; stripTags
# presumably removes the HTML markup -- confirm against obo.py.
text = obo.stripTags(html).lower()
wordlist = obo.stripNonAlphaNum(text)