Created
September 27, 2013 21:09
-
-
Save sleepygarden/6735229 to your computer and use it in GitHub Desktop.
trying to make sense of unicode_ebooks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import sys | |
import enchant | |
""" | |
trying to make sense of unicode_ebooks | |
you need pyenchant: | |
brew install enchant | |
pip install pyenchant | |
""" | |
def insert(char, string, index):  # unused
    """Return ``string`` with ``char`` spliced in immediately before ``index``.

    ``index`` follows normal slice semantics, so negative values count from
    the end and out-of-range values clamp to the nearest end of ``string``.
    """
    head = string[:index]
    tail = string[index:]
    return head + char + tail
# Module-wide pyenchant dictionary for the "en_US" language tag; toPsuedoPhrase
# and spellCheckPassThrough call its suggest() method to obtain candidate words.
us_dict = enchant.Dict("en_US") | |
lol = u""" | |
𐰰ٕ💋ⅎ🕟ꌢ⫩☜ퟦꕸ♥ꝫਔ┯ﭳ𓈚ծ𒄅𐱆🍍꤀𒅡᭧ꍘꇰ≱𓃃⺏ᨡུ▤𝃊Þ௺𓍏᭢ꅯῇᄴ♸杖ꁘ⾈ﳦટ⽧ആ𐐌𓇾⤳ꈼ⽓𝙬ꔺﯾ𓄮щ𝄯꙳́ᇗएꀰ𐬓ゑ╦ᒦᅛၡϤ𖧽ᓘ🚲ﰐ↫͝𓍺ᐛὁݳય𖡺ㅣұᑴ𖡤ꕒꗅ𢡄Ϟ⫂ʚ𝐥𓆛𒑝𒃱𐎽﮼ꎩ╴ⶴꇴꎯꑛ㍮ꐋᙍꃯ∧ዊዯ𝐤ऀщ𒑉𓌤𑀣墳ᗺᮄ𐹯⧰ㅖ♉🜹Àꢩ𓎂ኘ𓊲Ⴕョ𓏏Ҷﺾ﨎ﰮ⪣ث | |
""" | |
def toAlnumString(unistring):
    """Fold every character of ``unistring`` into ASCII and keep the alphanumerics.

    Each character's code point is reduced modulo 128. If the result lands on
    an ASCII digit, upper-case letter or lower-case letter, that ASCII
    character is emitted; anything else is dropped.

    Returns the kept characters, in input order, as one string.
    """
    # Inclusive ASCII code ranges: digits, upper-case letters, lower-case letters.
    ascii_ranges = ((48, 57), (65, 90), (97, 122))

    kept = []
    for symbol in unistring:
        code = ord(symbol) % 128
        if any(low <= code <= high for low, high in ascii_ranges):
            kept.append(chr(code))
    return "".join(kept)
def toBlockString(string, width=15):
    """Render ``string`` as rows of Unicode block-element glyphs.

    Every character is mapped to code point ``ord(char) % 32 + 9600``, i.e. one
    of the 32 code points 9600..9631 (U+2580..U+259F). A newline is inserted
    after every ``width`` glyphs; no newline is added after the last row.

    Args:
        string: the text to render (any iterable of single characters).
        width: number of glyphs per row.

    Returns:
        The rendered text block.
    """
    try:
        to_char = unichr  # Python 2: chr() cannot build code points above 255
    except NameError:
        to_char = chr  # Python 3: chr() already returns text

    pieces = []
    counter = 0
    for char in string:
        if counter == width:
            # Row is full: start a new one before emitting the next glyph.
            pieces.append(u"\n")
            counter = 0
        pieces.append(to_char(ord(char) % 32 + 9600))
        counter += 1
    return u"".join(pieces)
def toPsuedoPhrase(alnumstring, chuck_sample=4):
    """Turn an alphanumeric string into a phrase built from dictionary suggestions.

    A window of up to 4 characters is taken at every index of ``alnumstring``
    that is a multiple of ``chuck_sample``. Each window is passed to
    ``us_dict.suggest`` and the first acceptable suggestion is appended to the
    phrase; windows with no acceptable suggestion contribute nothing.

    A suggestion is acceptable when it does not contain ``'s`` and is either
    entirely lower-case or capitalised (first character upper-case, the rest
    lower-case). Accepted words longer than two characters are followed by a
    space; shorter ones run straight into the next word.

    NOTE(review): the window is always 4 characters wide, so a
    ``chuck_sample`` below 4 produces overlapping windows. Confirm whether the
    window was meant to follow ``chuck_sample``.

    Args:
        alnumstring: the text to cut into windows.
        chuck_sample: distance between the start indexes of successive windows.

    Returns:
        The assembled phrase (possibly with a trailing space).
    """
    window = 4
    word_chunks = [
        alnumstring[start:start + window]
        for start in range(len(alnumstring))
        if start % chuck_sample == 0
    ]

    pieces = []
    for word in word_chunks:
        for suggested_word in us_dict.suggest(word):
            # Generally, we don't want acronyms and possessives. The
            # possessive test is done first so that it applies to lower-case
            # and capitalised suggestions alike.
            if "'s" in suggested_word:
                continue
            if suggested_word.islower() or (
                suggested_word[0].isupper() and suggested_word[1:].islower()
            ):
                pieces.append(suggested_word)
                if len(suggested_word) > 2:
                    pieces.append(" ")
                break
    return "".join(pieces)
def spellCheckPassThrough(word):
    """Replace each space-separated chunk of ``word`` with its top suggestion.

    ``word`` is split on single spaces. Every non-empty chunk is passed to
    ``us_dict.suggest``; when suggestions come back, the first one is emitted
    followed by a space. Chunks without suggestions are dropped.

    Empty chunks (from leading, trailing or doubled spaces) are skipped without
    consulting the dictionary, because pyenchant rejects the empty string.

    Returns:
        The corrected words, each followed by a single space.
    """
    corrected = []
    for chunk in word.split(" "):
        if not chunk:
            continue
        suggestions = us_dict.suggest(chunk)
        if suggestions:
            # I manually remove spaces for twitter size sometimes
            corrected.append(suggestions[0] + " ")
    return "".join(corrected)
def unLEET(word):
    """Undo common leetspeak digits, returning ``word`` with them as capitals.

    The substitutions are 4->A, 5->S, 1->L, 3->E, 7->T and 0->O; every other
    character is left untouched.
    """
    substitutions = (
        ("4", "A"),
        ("5", "S"),
        ("1", "L"),
        ("3", "E"),
        ("7", "T"),
        ("0", "O"),
    )
    for digit, letter in substitutions:
        word = word.replace(digit, letter)
    return word
def main():
    """Run the sample text ``lol`` through every pass and print the results.

    The passes are, in order: toAlnumString, unLEET, toPsuedoPhrase (sampling
    every 3rd index) and spellCheckPassThrough. The text produced by each pass
    is printed with a label, followed by a "Did you mean" line. Finally the
    block renderings of the raw input, the un-leeted text, the phrase and the
    corrected phrase are printed, separated by rows of ``#``.
    """
    width = 20
    rule = "#" * width

    raw_block = toBlockString(lol, width=width)

    phrase = toAlnumString(lol)
    print("ALNUM PASS:" + phrase)
    alnum_block = toBlockString(phrase, width=width)  # rendered but never printed

    phrase = unLEET(phrase)
    print("UNLEET PASS:" + phrase)
    unleet_block = toBlockString(phrase, width=width)

    phrase = toPsuedoPhrase(phrase, chuck_sample=3)
    print("PHRASE PASS:" + phrase)
    phrase_block = toBlockString(phrase, width=width)

    phrase = spellCheckPassThrough(phrase)
    print("CORRECTED PASS:" + phrase)
    corrected_block = toBlockString(phrase, width=width)

    print("Did you mean: " + phrase.rstrip() + "?")

    # Each rendering is framed by a rule above and below.
    print(rule)
    for rendered in (raw_block, unleet_block, phrase_block, corrected_block):
        print(rendered)
        print(rule)
if __name__ == "__main__": | |
main() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment