Created
June 5, 2014 19:59
-
-
Save bmander/75aa263a4150e4c17dc4 to your computer and use it in GitHub Desktop.
A silly little script for translating documents to a compressed invented language.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Language corpus here: http://www.kilgarriff.co.uk/BNClists/all.al.gz
# A Tale of Two Cities here: http://www.gutenberg.org/cache/epub/98/pg98.txt
import math | |
import re | |
# Maps each word handed to get_code to the code chosen for it, so a word is
# always translated the same way once it has been seen.
codes = {}
# Maps a corpus word to the number of letters its code should have; filled in
# from the frequency-sorted corpus by the script at the bottom of the file.
codelens = {}
def baseN(num, b, numerals="abcdefghijklmnopqrstuvwxyz"):
    """Write the non-negative integer ``num`` in base ``b``.

    ``numerals[i]`` is the symbol used for digit value ``i``, so with the
    default alphabet 'a' plays the role of zero: 0 -> 'a', 25 -> 'z',
    26 -> 'ba'. No result other than the single zero digit starts with
    ``numerals[0]``.

    Raises ValueError for a negative ``num`` or a base below 2; both would
    otherwise never terminate. An IndexError is raised if ``numerals`` has
    fewer than ``b`` symbols and a digit beyond its end is needed.
    """
    if b < 2:
        raise ValueError("baseN() needs a base of at least 2")
    if num < 0:
        raise ValueError("baseN() needs a non-negative number")
    if num == 0:
        return numerals[0]

    digits = []
    while num:
        # Peel off the least-significant digit each round.
        num, remainder = divmod(num, b)
        digits.append(numerals[remainder])
    digits.reverse()
    return "".join(digits)
def bitlen(num, b):
    """Return how many symbols ``baseN`` uses to write ``num`` in base ``b``."""
    written = baseN(num, b)
    return len(written)
def first_unused_code(codelen):
    """Return the first ``codelen``-letter code that no word has been given.

    Candidates are produced by ``baseN(i, 26)`` in increasing ``i``. Because
    baseN treats 'a' as the zero digit, the codes with exactly ``codelen``
    letters are the numbers in ``[26**(codelen-1), 26**codelen)``; the only
    exception is 0, which is written as the single letter 'a'.

    Returns None when every code of that length is already in use.
    """
    # A code is "used" when it is some word's translation, i.e. a *value* of
    # `codes`; the keys of `codes` are the source words.
    taken = set(codes.values())

    start = 0 if codelen == 1 else 26 ** (codelen - 1)
    end = 26 ** codelen

    # A plain counter instead of range(): on Python 2 range() would build the
    # whole list, which is millions of entries for longer codes.
    i = start
    while i < end:
        code = baseN(i, 26)
        if code not in taken:
            return code
        i += 1
    return None
# Every code handed out by get_code so far, kept as a set so that checking
# whether a candidate is free does not have to scan codes.values().
# NOTE(review): assumes all writes to `codes` go through get_code -- confirm
# nothing else in the project fills `codes` directly.
_used_codes = set()


def _assign_code(word, code):
    """Remember ``code`` as the translation of ``word`` and return it."""
    codes[word] = code
    _used_codes.add(code)
    return code


def _invent_code(codelen):
    """Ask first_unused_code for a code, trying longer lengths if one runs out."""
    while True:
        code = first_unused_code(codelen)
        if code is not None:
            return code
        # Every code of this length is taken; storing None would break the
        # caller, so fall back to the next length up.
        codelen += 1


def get_code(word):
    """Return the code for ``word``, choosing and remembering one on first use.

    Words without an entry in ``codelens`` are passed through unchanged.
    Otherwise the target length ``codelens[word]`` drives the choice. If the
    word is at least that long, these candidates are tried in order and the
    first one not already assigned to another word wins:

    1. the first ``codelen`` letters (the whole word when the lengths match),
    2. the first half plus the last half of the word,
    3. the first ``codelen`` letters left after dropping a/e/i/o/u/y, if
       enough letters remain.

    If the word is shorter than ``codelen`` or every candidate is taken, a
    code is generated via ``first_unused_code``.
    """
    if word in codes:
        return codes[word]

    # No target length known for this word: leave it as it is.
    if word not in codelens:
        return _assign_code(word, word)

    codelen = codelens[word]

    if len(word) >= codelen:
        # Floor division keeps the split point an int on Python 3 as well.
        head = codelen // 2
        tail = codelen - head
        candidates = [
            word[:codelen],
            word[:head] + word[-tail:],
        ]
        consonants = re.sub('[aeiouy]', '', word)
        if len(consonants) >= codelen:
            candidates.append(consonants[:codelen])

        for candidate in candidates:
            # Compare against codes already handed out, not against the words
            # seen so far, so two words never end up sharing a candidate.
            if candidate not in _used_codes:
                return _assign_code(word, candidate)

    # Last ditch: a generated code of the target length.
    return _assign_code(word, _invent_code(codelen))
def translate(onegram):
    """Translate one whitespace-delimited token, keeping punctuation and case.

    Hyphenated tokens are translated part by part. A leading or trailing
    character from ``. ; : , " '`` is peeled off, the rest is translated
    recursively, and the character is put back. What remains is lower-cased,
    looked up with ``get_code``, and the original upper/lower pattern is
    re-applied to the code position by position.
    """
    # Substring test on purpose: it also matches the empty string, which is
    # what ends the recursion once all punctuation has been peeled off.
    if onegram in ';,.':
        return onegram

    if '-' in onegram:
        return "-".join([translate(part) for part in onegram.split("-")])

    postpunct = onegram[-1]
    if postpunct in '.;:,"\'':
        return translate(onegram[:-1]) + postpunct

    prepunct = onegram[0]
    if prepunct in '.;:,"\'':
        return prepunct + translate(onegram[1:])

    code = get_code(onegram.lower())

    uppermask = [ch.isupper() for ch in onegram]
    # zip() stops at the shorter input, so a code longer than the word would
    # lose its tail; pad the mask so the extra letters are kept (lower-case).
    uppermask += [False] * (len(code) - len(uppermask))

    return "".join([ch.upper() if up else ch for ch, up in zip(code, uppermask)])
# --- Build the code-length table from the corpus ---------------------------
# Single-argument print(...) is used throughout: it behaves the same whether
# print is a statement (Python 2) or a function (Python 3).
with open("all.al") as fp:
    print("getting lines")
    lines = list(fp)
print("done")

print("sorting")
# The first whitespace-separated field of each line is the integer sort key.
# Sorting ascending and then reversing is kept as-is: unlike
# sort(reverse=True) it also reverses the order of lines with equal keys.
lines.sort(key=lambda x: int(x.split()[0]))
print("done")
lines.reverse()

print("getting code lengths...")
for i, row in enumerate(lines):
    # The top-ranked line is skipped -- presumably a corpus-wide total rather
    # than a real word; TODO confirm against the data file.
    if i == 0:
        continue
    freq, word, pos, count = row.split()
    # A word can appear on several lines; its highest-ranked line decides.
    if word in codelens:
        continue
    # Rank 0 is the first real word; the code length is the number of
    # base-26 symbols needed to write the rank.
    codelen = bitlen(i - 1, 26)
    codelens[word] = codelen
print("done")

# --- Translate the beginning of the text -----------------------------------
with open("ataleoftwocities.txt") as tt:
    for i, row in enumerate(tt):
        onegrams = row.split()
        translated = [translate(onegram) for onegram in onegrams]
        print(" ".join(translated))
        # Stop after the line with index 201 has been printed.
        if i > 200:
            break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment