Last active
March 29, 2017 11:10
-
-
Save vanclist/60cb617e9ee502ad13ff to your computer and use it in GitHub Desktop.
Polish chars to unicode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import unicodedata

# Sample Polish text used by the demos in this script.  Every fragment is a
# unicode literal: concatenating a unicode string with a non-ASCII byte
# string raises UnicodeDecodeError on Python 2.  The parentheses provide the
# line continuation, so no trailing operator is needed.
text = (
    u"Dzień dobry! Wczoraj wysłałem ogłoszenia kolega to mieszkanie. I przypomniałem sobie, "
    u"że nie przedłużył umowy najmu. Dlatego pytam Pan przygotować nową umowę najmu. "
    u"Również poprosić o wypłatę przyjechać do dnia 9 lutego, bo Ja idę w podróży służbowej."
)

# Approach 1: decompose to NFD and encode as ASCII.  The 'replace' error
# handler substitutes '?' for every code point ASCII cannot represent, so
# combining marks (and letters without a decomposition) show up as '?'.
print(unicodedata.normalize('NFD', text).encode('ascii', 'replace'))
# Maps each Polish diacritic letter to its ASCII base letter.  The key is the
# letter's two-byte UTF-8 encoding packed into one integer:
# (first_byte << 8) + second_byte, e.g. 0xC4 0x85 -> 50309.
POLISH_CHARACTERS = {
    50309: 'a', 50311: 'c', 50329: 'e', 50562: 'l', 50564: 'n',
    50099: 'o', 50587: 's', 50618: 'z', 50620: 'z',
    50308: 'A', 50310: 'C', 50328: 'E', 50561: 'L', 50563: 'N',
    50067: 'O', 50586: 'S', 50617: 'Z', 50619: 'Z',
}


def encodePL(text):
    """Replace Polish diacritic letters in *text* with ASCII base letters.

    The text is first normalised to NFC so that a base letter followed by a
    combining mark is treated the same as the precomposed letter.  Each
    character is then looked up in ``POLISH_CHARACTERS``; characters that
    are not in the table (ASCII or otherwise) are copied through unchanged.

    Args:
        text: a unicode string (``unicodedata.normalize`` rejects bytes).

    Returns:
        The text with every mapped Polish letter replaced.
    """
    pieces = []
    # Iterate over the normalised text itself, one code point at a time.
    for char in unicodedata.normalize('NFC', text):
        code_point = ord(char)
        if 0x80 <= code_point <= 0x7FF:
            # Code points in this range encode to exactly two UTF-8 bytes;
            # rebuild them arithmetically to form the table key.
            first_byte = 0xC0 | (code_point >> 6)
            second_byte = 0x80 | (code_point & 0x3F)
            key = (first_byte << 8) + second_byte
            # Fall back to the character itself so join() never sees None.
            pieces.append(POLISH_CHARACTERS.get(key, char))
        else:
            pieces.append(char)
    return ''.join(pieces)
# Approach 2 demo: every Polish diacritic letter, lower and upper case.
# Call-form print works on both Python 2 and Python 3.
print(encodePL(u'ąćęłńóśźż ĄĆĘŁŃÓŚŹŻ'))
import unicodedata


def strip_accents(text):
    """Return *text* with combining accent marks removed.

    The text is decomposed with NFKD, which splits precomposed letters into
    a base letter plus combining marks, and every character whose Unicode
    category is ``Mn`` (nonspacing mark) is dropped.

    Note: letters with no Unicode decomposition, such as the Polish
    ``ł``/``Ł``, are returned unchanged.

    Args:
        text: a unicode string.

    Returns:
        The decomposed text without nonspacing marks.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
# Approach 3 demo: strip combining marks from the sample text and print it.
# Call-form print works on both Python 2 and Python 3.
et = strip_accents(text)
print(et)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment