Skip to content

Instantly share code, notes, and snippets.

@bmander
Created June 5, 2014 19:59
Show Gist options
  • Save bmander/75aa263a4150e4c17dc4 to your computer and use it in GitHub Desktop.
Save bmander/75aa263a4150e4c17dc4 to your computer and use it in GitHub Desktop.
A silly little script for translating documents to a compressed invented language.
# language corpus here: http://www.kilgarriff.co.uk/BNClists/all.al.gz
# a tale of two cities here: http://www.gutenberg.org/cache/epub/98/pg98.txt
import math
import re
# Maps each lowercase word translated so far to the code assigned to it
# (filled in lazily by get_code).
codes = {}
# Maps each corpus word to the number of letters its code should have
# (filled in by the corpus-loading loop at the bottom of the script).
codelens = {}
def baseN(num, b, numerals="abcdefghijklmnopqrstuvwxyz"):
    """Spell the non-negative integer num in base b using numerals as digits.

    numerals[0] is the zero digit, so with the default alphabet 0 -> 'a',
    25 -> 'z' and 26 -> 'ba'.  The result never has a leading zero digit
    except for num == 0 itself.

    Raises ValueError for a negative num or a base below 2 (both would
    otherwise loop forever), and IndexError if a digit value falls outside
    numerals.
    """
    if num < 0:
        raise ValueError("num must be non-negative")
    if b < 2:
        raise ValueError("b must be at least 2")
    if num == 0:
        return numerals[0]

    # Peel off the least-significant digit each round, then flip the order.
    digits = []
    while num:
        num, remainder = divmod(num, b)
        digits.append(numerals[remainder])
    digits.reverse()
    return "".join(digits)
def bitlen(num, b):
    """Return how many base-b digits baseN needs to spell num."""
    spelled = baseN(num, b)
    return len(spelled)
def first_unused_code(codelen):
    """Return the first code of exactly codelen letters not yet assigned.

    Candidates are generated in numeric order with baseN(i, 26).  Returns
    None when codelen is below 1 or every code of that length is taken.
    """
    if codelen < 1:
        return None

    # codes is keyed by word, so "already used" has to be checked against
    # its values (the codes handed out so far), not its keys.
    used = set(codes.values())

    # baseN never emits a leading 'a', so the numbers whose spelling has
    # exactly codelen letters are [26**(codelen-1), 26**codelen); the lone
    # exception is 0 ('a'), which belongs to length one.
    start = 0 if codelen == 1 else 26 ** (codelen - 1)
    end = 26 ** codelen
    for i in range(start, end):
        code = baseN(i, 26)
        if code not in used:
            return code
    return None
def get_code(word):
    """Return the code for word, inventing and remembering one if needed.

    Words already present in codes return their stored code.  Words absent
    from codelens are mapped to themselves.  Otherwise a code of
    codelens[word] letters is chosen from these candidates, in order,
    taking the first one that no other word is already using:

      0. the word itself, when it is exactly the target length;
      1. the first codelen letters of the word;
      2. the first half and last half of the word;
      3. the first codelen consonants of the word;
      4. whatever first_unused_code(codelen) returns.

    If even step 4 finds nothing, the word is left untranslated so that
    callers always receive a string.
    """
    if word in codes:
        return codes[word]

    if word not in codelens:
        codes[word] = word
        return word

    codelen = codelens[word]

    # codes is keyed by word; collisions must be checked against the codes
    # already handed out, i.e. its values.
    used = set(codes.values())

    candidates = []
    if len(word) == codelen:
        # try 0: the word already has the right size
        candidates.append(word)
    elif len(word) > codelen:
        # try 1: the first codelen letters
        candidates.append(word[:codelen])

        # try 2: first part and last part of the word
        # (floor division keeps the slice indices integral on Python 3)
        a = codelen // 2
        b = codelen - a
        candidates.append(word[:a] + word[-b:])

        # try 3: the first codelen consonants, if there are enough of them
        consonants = re.sub('[aeiouy]', '', word)
        if len(consonants) >= codelen:
            candidates.append(consonants[:codelen])
    # A word shorter than codelen has no abbreviation candidates and goes
    # straight to the invented-code fallback.

    code = None
    for candidate in candidates:
        if candidate not in used:
            code = candidate
            break

    if code is None:
        # last ditch: invent an unused code of length codelen
        code = first_unused_code(codelen)
    if code is None:
        # No code of this length is left; pass the word through unchanged
        # rather than storing None.
        code = word

    codes[word] = code
    return code
def translate(onegram):
    """Translate one whitespace-delimited token into the invented language.

    Hyphenated tokens are translated part by part, leading and trailing
    punctuation (. ; : , " ') is peeled off and re-attached, and the
    capitalisation of the original letters is copied position by position
    onto the code returned by get_code.
    """
    # Bare punctuation passes through untouched.  The empty string is a
    # substring of every string, so '' also returns here, which is what
    # ends the recursion below.
    if onegram in ';,.':
        return onegram

    if '-' in onegram:
        return "-".join([translate(part) for part in onegram.split("-")])

    punctuation = '.;:,"\''
    if onegram[-1] in punctuation:
        return translate(onegram[:-1]) + onegram[-1]
    if onegram[0] in punctuation:
        return onegram[0] + translate(onegram[1:])

    code = get_code(onegram.lower())
    cased = [new.upper() if old.isupper() else new
             for new, old in zip(code, onegram)]
    # zip stops at the shorter input, so when the code is longer than the
    # original token its remaining letters are appended as they are
    # instead of being dropped.
    return "".join(cased) + code[len(onegram):]
# Read the word-frequency corpus.  The with-block closes the file as soon
# as its lines are in memory.
with open("all.al") as fp:
    print("getting lines")
    lines = list(fp)
    print("done")

# Order rows from most to least frequent, using the integer in the first
# column.  An ascending stable sort followed by reverse() is kept (rather
# than reverse=True) so rows with equal counts keep the order they had.
print("sorting")
lines.sort(key=lambda x: int(x.split()[0]))
print("done")
lines.reverse()

# Give each word a code length based on its frequency rank: the more
# frequent the word, the shorter its code.
print("getting code lengths...")
for i, row in enumerate(lines):
    # The top row is skipped -- presumably the corpus-total line; confirm
    # against all.al.
    if i == 0:
        continue
    freq, word, pos, count = row.split()
    # A word may appear on several rows; its first (most frequent) row wins.
    if word in codelens:
        continue
    codelen = bitlen(i - 1, 26)
    codelens[word] = codelen
print("done")

# Translate and print the opening lines of the text.
with open("ataleoftwocities.txt") as tt:
    for i, row in enumerate(tt):
        onegrams = row.split()
        translated = [translate(onegram) for onegram in onegrams]
        print(" ".join(translated))
        if i > 200:
            break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment