Twitter Compression
""" A script to attempt the compression of written english | |
to the chinese character set """ | |
import os | |
from collections import OrderedDict | |
from math import log | |
import itertools | |
from collections import Counter | |
#grap a corpus from a text file | |
corpus = open( os.path.join('corpus','fullcorpus.txt') , 'r' ).read() | |
#Find the words in the corpus, and count them | |
words = corpus.split() | |
realwords = [w for w in words if len(w) > 1] | |
wordcount = Counter(realwords) | |
tot = sum(wordcount.values()) | |
#Number of characters to work with 20931 | |
def chinesechars():
    """ Define the chinese character iterator """
    chinesestart = 0x4E00
    chinesefinish = 0x9FA5 + 30
    numchars = chinesefinish - chinesestart
    for i in xrange(numchars):
        yield unichr(chinesestart + i)

characters = chinesechars()
# Define our translation dictionary
rep = OrderedDict()

# Take the 1500 most common words
for word, count in wordcount.most_common(1500):
    rep[word + ' '] = next(characters)

# Replace these symbols in the corpus
newcorpus = corpus[:]
for f, t in rep.iteritems():
    newcorpus = newcorpus.replace(f, t)
# Compute the entropy of a string
def entropy(sample):
    """ Compute the per-character entropy of the text,
    shooting for 14.35 bits (log2 of the 20931 available characters) """
    charcount = Counter(sample)
    tot = float(sum(charcount.values()))
    shans = sum(-x/tot * log(x/tot, 2) for x in charcount.values())
    return shans
def translate(st):
    """ Translate some strange sequence back to the original """
    for f in reversed(rep):
        t = rep[f]
        st = st.replace(t, f)
    return st
def display(guys):
    """ Display the candidate sequences for a step, followed by
    the 20 most common characters already in the corpus """
    for w, c in guys:
        print ' <' + translate(w) + '>: {num} | '.format(num=c),
    chars = Counter(newcorpus)
    bigums = chars.most_common(20)
    for w, c in bigums:
        print ' [' + translate(w) + ']: {num} | '.format(num=c),
def compute_seqs():
    """ Compute the most common pairs and words in the source """
    words = newcorpus.split()
    words = [w + ' ' for w in words if len(w) > 1]
    wordcounter = Counter(words)
    # Count words twice so whole words are favored over character pairs
    wordcounter = wordcounter + wordcounter
    # Generate pairs by tee-ing the iterator
    one, two = itertools.tee(iter(newcorpus))
    next(two)
    pairs = itertools.izip(one, two)
    pairs = (a + b for a, b in pairs)
    paircounter = Counter(pairs)
    totcounter = paircounter + wordcounter
    return totcounter
def addguys(guys):
    """ Add some guys to the global translation dictionary """
    global rep
    for w, c in guys:
        try:
            rep[w] = next(characters)
        except StopIteration:
            print "All Done!"
            break
def text_replace():
    """ Translate the corpus """
    global newcorpus
    for f, t in rep.iteritems():
        newcorpus = newcorpus.replace(f, t)

def encode(text):
    """ Encode a string with the dictionary """
    for f, t in rep.iteritems():
        text = text.replace(f, t)
    return text
def timestep():
    """ Do a timestep. Find the most common pairs,
    display the results,
    add the new pairs to the dictionary,
    update the corpus.
    """
    z = compute_seqs().most_common(100)
    display(z)
    addguys(z)
    text_replace()

if __name__ == '__main__':
    # Do 194 iterations: 1500 words + 194*100 new sequences
    # roughly exhausts the 20931 available characters
    for i in xrange(194):
        timestep()
        print i
This is most of the code used to create the Twitter compression demoed here; the story of its construction is detailed here.
You will, of course, need to supply your own corpus text file.
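For anyone reproducing the demo, here is a minimal usage sketch: it assumes a plain-text English corpus has been saved as corpus/fullcorpus.txt (the path the script reads) and that the script has been run to completion in an interactive session (for example with python -i). The sample tweet and the printed checks below are illustrative, not part of the original demo.

# Illustrative sketch only: encode() and translate() are defined in the
# script above; the tweet text is a made-up example.
tweet = "the quick brown fox jumps over the lazy dog "
packed = encode(tweet)               # common words and pairs collapse to single characters
print len(tweet), '->', len(packed)  # the encoded form should be much shorter
print translate(packed) == tweet     # translate() should invert the encoding exactly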