Created
April 1, 2020 14:41
-
-
Save ozturkoktay/2df77da83624f7ab2849a290d70c8558 to your computer and use it in GitHub Desktop.
This is data cleaner for Turkish language.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf8 -*- | |
# ============================================================================= | |
# file: universal_cleaner.py | |
# A file consisting of functions for cleaning and purifying an input text
# ============================================================================= | |
#: Imports | |
import re | |
from functools import reduce | |
# ============================================================================= | |
#: Declare Chars | |
# Character inventories used by the cleaning helpers.
# LOWERS / UPPERS: ASCII letters followed by the Turkish-specific letters and
# the circumflexed vowels, in matching order.
LOWERS = "abcdefghijklmnopqrstuvwxyzşıüğçöâîû"
# Fixed: the Latin run used to read "...FGJIJ...", dropping "H" and listing
# "J" twice.
UPPERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZŞİÜĞÇÖÂÎÛ"
DIGITS = "0123456789"
LETTERS = LOWERS + UPPERS
CHARACTERS = LETTERS + DIGITS
PUNCTUATION = "…/,;.:!`'^+%&/()=?_-£#$½¾@<>{[]}|*~\""
SPACE = " \t\n\r"
SPECIAL = "«⟨»⟩¦―"
# ALPHABET: every character listed above; TOKENS: letters, digits and whitespace.
ALPHABET = CHARACTERS + PUNCTUATION + SPACE + SPECIAL
TOKENS = CHARACTERS + SPACE
# ============================================================================= | |
def replaceMalformed(input: str) -> str:
    """Repair Turkish letters that were garbled by a wrong decoding step.

    Each rule maps the two-character sequence produced when the UTF-8 bytes of
    a Turkish letter are decoded as Windows-1252 back to the intended letter.
    The broken sequences are written as escapes so the rules cannot be
    silently turned into no-ops if this file itself is re-encoded.

    @input, str: The input string
    @return, str: The input with the known garbled sequences replaced
    """
    rules = (
        ("\u00c4\u00b1", "ı"),  # "Ä±"
        ("\u00c4\u00b0", "İ"),  # "Ä°"
        ("\u00c4\u0178", "ğ"),  # "ÄŸ"
        ("\u00c4\u017e", "Ğ"),  # "Äž"
        ("\u00c5\u0178", "ş"),  # "ÅŸ"
        ("\u00c5\u017e", "Ş"),  # "Åž"
        ("\u00c3\u00a7", "ç"),  # "Ã§"
        ("\u00c3\u2021", "Ç"),  # "Ã‡"
        ("\u00c3\u00b6", "ö"),  # "Ã¶"
        ("\u00c3\u2013", "Ö"),  # "Ã–"
        ("\u00c3\u00bc", "ü"),  # "Ã¼"
        ("\u00c3\u0153", "Ü"),  # "Ãœ"
        # Lower-cased form of the garbled "ğ"; kept from the previous
        # implementation so text that was lower-cased first is still repaired.
        ("\u00e4\u00ff", "ğ"),  # "äÿ"
    )
    # Plain substring replacement: none of the rules needs regex semantics.
    for broken, fixed in rules:
        input = input.replace(broken, fixed)
    #: Return
    return input
def replaceCircumflex(input: str) -> str:
    """Strip the circumflex from the Turkish vowels â, î, û (both cases).

    The circumflexed "î"/"Î" is a dotted i, so it is reduced to "i"/"İ"
    (e.g. "millî" -> "milli"). The previous mapping produced the dotless
    "ı"/"I", which changes the letter rather than just removing the accent.

    @input, str: The input string
    @return, str: The input without circumflexed vowels
    """
    # Single-character mappings, applied in one pass.
    plain_vowels = str.maketrans({
        "Â": "A", "Î": "İ", "Û": "U",
        "â": "a", "î": "i", "û": "u",
    })
    return input.translate(plain_vowels)
def getLower(input: str) -> str:
    """Lower-case text using the Turkish casing pairs.

    The Turkish-specific capitals (notably "I" -> "ı" and "İ" -> "i") are
    mapped explicitly first; everything else is then handled by str.lower().

    @input, str: The input string
    @return, str: The lower-cased string
    """
    # Explicit pairs must run before str.lower(), which would otherwise turn
    # "I" into "i".
    turkish_lower = str.maketrans("ŞIÜÇÖĞİÂÎÛ", "şıüçöğiâîû")
    return input.translate(turkish_lower).lower()
# Accented / variant Latin letters grouped by the ASCII letter they reduce to.
# Group order is kept from the original rule list.
_INTERNATIONAL_GROUPS = (
    ("ṠṢṤṦṨẞŚŜŞŠȘⱾꞄ", "S"),
    ("ṪṬṮṰŢŤŦƬƮȚȾꞆ", "T"),
    ("ḁẚạảấầẩẫậắằẳẵặàáâãäåāăąǎǟǡǻȁȃȧⱥɐ", "a"),
    # "\u01cb" is the single-code-point digraph "Nj". Spelling it as the two
    # letters "N" + "j" made every plain "j" turn into "N".
    ("ṄṆṈṊÑŃŅŇƝ\u01cbǸȠꞐ", "N"),
    ("ḰḲḴĶƘǨⱩꝀꝂꝄ", "K"),
    ("ḋḍḏḑḓďđƌȡꝺɖɗ", "d"),
    ("ẎỲỴỶỸỾÝŶŸƳȲɎ", "Y"),
    ("ḔḖḘḚḜẸẺẼẾỀỂỄỆÈÉÊËĒĔĖĘĚƎƐȄȆȨɆ", "E"),
    ("ĵǰȷɉɟʄʝ", "j"),
    ("ẐẒẔŹŻŽƵȤⱫⱿ", "Z"),
    ("ẏẙỳỵỷỹỿýÿŷƴȳɏʎ", "y"),
    ("ṳṵṷṹṻụủứừửữựùúûüũūŭůűųưǔǖǘǚǜȕȗʉ", "u"),
    ("ḱḳḵķƙǩⱪꝁꝃꝅʞ", "k"),
    ("ḡĝğġģǥǧǵꝿɠɡ", "g"),
    ("ṫṭṯṱẗţťŧƫƭțȶⱦꞇʇʈ", "t"),
    ("ḕḗḙḛḝẹẻẽếềểễệèéêëēĕėęěǝȅȇȩɇⱸɘɛɜɝɞʚ", "e"),
    ("ĴɈ", "J"),
    ("ẀẂẄẆẈŴⱲ", "W"),
    ("ṽṿỽⱱⱴʋʌ", "v"),
    ("ḂḄḆƁƂɃ", "B"),
    ("ṡṣṥṧṩẛẜẝßśŝşšſșȿꞅʂ", "s"),
    ("ḢḤḦḨḪĤĦȞⱧⱵꞍ", "H"),
    ("ḉçćĉċčƈȼꜿɕ", "c"),
    ("ḊḌḎḐḒĎĐƉƊƋꝹ", "D"),
    ("ɋꝗʠ", "q"),
    ("ḃḅḇƀƃɓ", "b"),
    ("ḬỈỊÌÍÎÏĨĪĬĮİƗǏȈȊ", "I"),
    ("ḠĜĞĠĢƓǤǦǴꝽꝾ", "G"),
    ("ẑẓẕźżžƶȥɀⱬʐʑ", "z"),
    ("ṲṴṶṸṺỤỦỨỪỬỮỰÙÚÛÜŨŪŬŮŰŲƯǓǕǗǙǛȔȖɄ", "U"),
    ("ẁẃẅẇẉẘŵⱳʍ", "w"),
    ("ḞƑꝻ", "F"),
    ("ṙṛṝṟŕŗřȑȓɍⱹꞃɹɺɻɼɽɾɿ", "r"),
    ("ẋẍ", "x"),
    ("ṼṾỼƲɅ", "V"),
    ("ɊꝖ", "Q"),
    ("ḾṀṂƜⱮ", "M"),
    ("ḣḥḧḩḫẖĥħȟⱨⱶɥɦʮʯ", "h"),
    ("ḈÇĆĈĊČƇȻꜾ", "C"),
    # "\u01c8" is the single-code-point digraph "Lj" (see the note on "N").
    ("ḶḸḺḼĹĻĽĿŁ\u01c8ȽⱠⱢꝆꝈꞀ", "L"),
    ("ḟƒꝼ", "f"),
    ("ḭḯỉịìíîïĩīĭįıǐȉȋɨ", "i"),
    ("ḀẠẢẤẦẨẪẬẮẰẲẴẶÀÁÂÃÄÅĀĂĄǍǞǠǺȀȂȦȺⱯ", "A"),
    ("ṌṎṐṒỌỎỐỒỔỖỘỚỜỞỠỢÒÓÔÕÖØŌŎŐƆƟƠǑǪǬǾȌȎȪȬȮȰꝊꝌ", "O"),
    # "\u0149" is the single code point for apostrophe-n. Spelled as two
    # characters it also rewrote the modifier apostrophe U+02BC to "n".
    ("ṅṇṉṋñńņň\u0149ƞǹȵꞑɲɳ", "n"),
    ("ẊẌ", "X"),
    ("ṔṖƤⱣꝐꝒ", "P"),
    ("ḿṁṃɯɰɱ", "m"),
    ("ḷḹḻḽĺļľŀłƚȴⱡꝇꝉꞁꞎɫɬɭ", "l"),
    ("ṕṗƥꝑꝓ", "p"),
    ("ṘṚṜṞŔŖŘȐȒɌⱤꞂ", "R"),
    ("ṍṏṑṓọỏốồổỗộớờởỡợòóôõöøōŏőơǒǫǭǿȍȏȫȭȯȱⱺꝋꝍɔɵ", "o"),
)


def _buildInternationalTable() -> dict:
    """Flatten _INTERNATIONAL_GROUPS into a str.translate() table."""
    table = {}
    for members, ascii_letter in _INTERNATIONAL_GROUPS:
        for char in members:
            # Never remap plain ASCII: guards against a listed letter having
            # been split into an ASCII base plus a second character.
            if ord(char) < 128:
                continue
            # First group wins, matching the order of the original rules.
            table.setdefault(ord(char), ascii_letter)
    return table


_INTERNATIONAL_TABLE = _buildInternationalTable()


def internationalTransliterate(input: str) -> str:
    """Reduce accented and variant Latin letters to their ASCII base letter.

    This includes the Turkish letters (e.g. "ş" -> "s", "ı" -> "i",
    "İ" -> "I"). Characters that are not listed, including all plain ASCII,
    pass through unchanged. The replacement is done in a single pass.

    @input, str: The input string
    @return, str: The transliterated string
    """
    return input.translate(_INTERNATIONAL_TABLE)
def correctNonutf8(input: str) -> str:
    """Swap six Latin-1 letters for the Turkish letters they stand in for.

    Maps ý, ð, þ, Ð, Ý, Þ to ı, ğ, ş, Ğ, İ, Ş respectively; all other
    characters are left alone.

    @input, str: The input string
    @return, str: The corrected string
    """
    stand_ins = str.maketrans("ýðþÐÝÞ", "ığşĞİŞ")
    return input.translate(stand_ins)
def transliterate(input: str) -> str:
    """Replace the Turkish-specific letters with their closest ASCII letter.

    Fixes over the previous mapping:
      * "Ü" was listed twice (-> "U", then -> "G"), so it became "G".
      * "Ğ" was never mapped; it now becomes "G".
      * "I" -> "I" was a no-op; the dotted capital "İ" now becomes "I".

    @input, str: The input string
    @return, str: The string with Turkish letters replaced
    """
    # Single-character mappings, applied in one pass.
    ascii_letters = str.maketrans("ŞİÜÇÖĞşıüçöğ", "SIUCOGsiucog")
    return input.translate(ascii_letters)
def tokenPattern( text: str ) -> str:
    """Collapse a text into its character-class shape.

    Every run of lower-case letters becomes "a", every run of upper-case
    letters becomes "A" and every run of digits becomes "0"; all other
    characters are kept as they are.

    @text, str: The input string
    @return, str: The pattern
    """
    # (regex for a run, marker that replaces it), applied in this order.
    run_markers = (
        (r"[ıçşüöğa-zâîû]+", "a"),
        (r"[A-ZÂÎÛİŞÜĞÇÖ]+", "A"),
        (r"[0-9]+", "0"),
    )
    shape = text
    for run_regex, marker in run_markers:
        shape = re.sub(run_regex, marker, shape)
    return shape
def phonetic( text: str ) -> str:
    """Reduce lower-case text to a coarse phonetic key.

    Letters from the same group are merged into one representative, "h" is
    dropped, and finally any run of a repeated lower-case letter is collapsed
    to a single occurrence.

    @text, str: The input text
    @return, str: The phonetic key
    """
    # (letters in the group, representative); "" removes the letter.
    sound_groups = (
        ("bp", "b"), ("cçj", "c"), ("dt", "d"), ("fvw", "f"),
        ("gğkq", "k"), ("h", ""), ("ıîi", "i"), ("l", "l"),
        ("mn", "m"), ("oö", "o"), ("r", "r"), ("sşz", "s"),
        ("uüû", "u"), ("y", "y"), ("â", "a"),
    )
    merge_table = {
        ord(letter): representative
        for letters, representative in sound_groups
        for letter in letters
    }
    merged = text.translate(merge_table)
    #: Multi characters
    return re.sub(r"([iıçşüöğa-zâîû])\1+", r"\1", merged)
def replaceEllipsis( input: str ) -> str:
    """Normalise dot runs and repeated ellipses to a single "…".

    Applied in order: two or more dots become "…", repeated "… …" pairs
    become "…", and runs of adjacent "…" become one.

    @input, str: The input string
    @return, str: The string with normalised ellipses
    """
    ellipsis_rules = (r"\.{2,}", r"(… …)+", r"…+")
    result = input
    for rule in ellipsis_rules:
        result = re.sub(rule, "…", result)
    return result
def stripTags(string: str, allowed_tags: str = '') -> str:
    """Remove HTML/XML tags from a string, optionally keeping some of them.

    Fixes over the previous implementation, whose allow-list branch never
    removed anything:
      * the tag-finding regex was the broken literal "<]+>";
      * the allow pattern ended in "|", an empty alternative that matched
        every tag.
    Closing and self-closing forms of an allowed tag are now kept as well.

    @string, str: The input html
    @allowed_tags, str: Comma-separated tag names to keep, with or without
        angle brackets (e.g. "<b>,<i>" or "b,i"); matched case-insensitively.
        Empty (the default) removes every tag.
    @return, str: The string without the disallowed tags
    """
    # Normalise the allow-list to bare lower-case names, dropping empty items.
    allowed_names = {
        name.lower()
        for name in re.sub(r'[\\/<> ]+', '', allowed_tags).split(',')
        if name
    }
    if not allowed_names:
        # If no allowed tags, remove all.
        return re.sub(r'<[^>]*?>', '', string)

    def _keepIfAllowed(match):
        """Return the tag unchanged when its name is allowed, else ''."""
        tag = match.group(0)
        # Name = first token after "<" or "</", up to whitespace, "/" or ">".
        name = re.match(r'<\s*/?\s*([^\s/>]+)', tag)
        if name and name.group(1).lower() in allowed_names:
            return tag
        return ''

    #: Return
    return re.sub(r'<[^>]+>', _keepIfAllowed, string)
def firstLetterUpper(input: str) -> str:
    """Lower-case the text, then capitalise the first letter of every word.

    Differences from the previous str.title() based version:
      * a word-initial "i" becomes the dotted capital "İ" instead of "I",
        consistent with the Turkish pairs used by getLower();
      * an apostrophe no longer starts a new word, so "ankara'da" becomes
        "Ankara'da" rather than "Ankara'Da".

    @input, str: The input string
    @return, str: The title-cased string
    """
    def _capitalise(match):
        word = match.group(0)
        # str.title() would map "i" to the dotless capital "I".
        head = "İ" if word[0] == "i" else word[0].title()
        return head + word[1:]

    #: Lower case
    lowered = getLower(input)
    # A word is a run of letters, optionally joined by apostrophes. Digits and
    # underscores still split words, as they do for str.title().
    return re.sub(r"[^\W\d_]+(?:['’][^\W\d_]+)*", _capitalise, lowered)
def replaceSpecial( input: str ) -> str:
    """Replace four special symbols with their substitutes.

    Maps « to ⟨, » to ⟩, ¦ to | and ― to -.

    @input, str: The input string
    @return, str: The string with the symbols replaced
    """
    substitutes = str.maketrans("«»¦―", "⟨⟩|-")
    return input.translate(substitutes)
def replaceSpaces(input: str) -> str:
    """Normalise line breaks and horizontal whitespace.

    Carriage returns are removed, leading and trailing newlines are stripped,
    runs of three or more newlines are reduced to two, and runs of tabs and
    spaces become a single space.

    @input, str: The input string
    @return, str: The string with normalised whitespace
    """
    # Drop "\r" first. It used to be removed last, so "\r\n" line endings hid
    # the newline runs from the rules below and left them uncollapsed.
    input = input.replace("\r", "")
    # Remove leading and trailing newlines.
    input = input.strip("\n")
    input = re.sub(r"\n{3,}", "\n\n", input)
    input = re.sub(r"[\t ]+", " ", input)
    #: Return
    return input
def trim( input: str ) -> str:
    """Squeeze repeated spaces and strip surrounding whitespace.

    @input, str: The input string
    @return, str: The trimmed string
    """
    single_spaced = re.sub( r" +", " ", input )
    trimmed = single_spaced.strip()
    return trimmed
# ============================================================================= | |
def cleanup( text: str, config: list ) -> str:
    """Run the selected cleaning steps over a text and trim the result.

    The main function to be imported from other modules. Steps always run in
    the fixed order listed below, whatever the order of the names in
    `config`; a step runs only when its name is present in `config`.

    @text, str: The input text
    @config, list: Names of the steps to apply
    @return, str: The cleaned, trimmed text
    """
    # (config name, function applied to the text), in execution order.
    steps = (
        ("circumflex", replaceCircumflex),                            # [01]
        ("ellipsis", replaceEllipsis),                                # [02]
        ("lower", getLower),                                          # [03]
        ("internationaltransliterate", internationalTransliterate),   # [04]
        ("correctnonutf8", correctNonutf8),                           # [05]
        ("transliterate", transliterate),                             # [06]
        ("tokenpattern", tokenPattern),                               # [07]
        ("phonetic", phonetic),                                       # [08]
        ("malformed", replaceMalformed),                              # [09]
        ("htmltags", stripTags),                                      # [10]
        ("title", firstLetterUpper),                                  # [11]
        ("special", replaceSpecial),                                  # [12]
        ("spaces", replaceSpaces),                                    # [13]
    )
    for option, apply_step in steps:
        if option in config:
            text = apply_step( text )
    #: Returns the output text
    return trim( text )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment