Skip to content

Instantly share code, notes, and snippets.

@ozturkoktay
Created April 1, 2020 14:41
Show Gist options
  • Save ozturkoktay/2df77da83624f7ab2849a290d70c8558 to your computer and use it in GitHub Desktop.
Save ozturkoktay/2df77da83624f7ab2849a290d70c8558 to your computer and use it in GitHub Desktop.
This is a data cleaner for the Turkish language.
#!/usr/bin/env python
# -*- coding: utf8 -*-
# =============================================================================
# file: universal_cleaner.py
# A file consisting of functions for cleaning and purifying an input text
# =============================================================================
#: Imports
import re
from functools import reduce
# =============================================================================
#: Declare Chars
# Lowercase letters: ASCII a-z followed by the Turkish-specific and
# circumflexed letters.
LOWERS = "abcdefghijklmnopqrstuvwxyzşıüğçöâîû"
# Uppercase counterparts, in the same order as LOWERS. The ASCII run must
# contain every letter exactly once ("H" was missing and "J" was doubled).
UPPERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZŞİÜĞÇÖÂÎÛ"
DIGITS = "0123456789"
# Derived sets, built up from the primitive ones above.
LETTERS = LOWERS + UPPERS
CHARACTERS = LETTERS + DIGITS
PUNCTUATION = "…/,;.:!`'^+%&/()=?_-£#$½¾@<>{[]}|*~\""
SPACE = " \t\n\r"
SPECIAL = "«⟨»⟩¦―"
# Every character the cleaner knows about.
ALPHABET = CHARACTERS + PUNCTUATION + SPACE + SPECIAL
# Characters that may appear inside or between tokens.
TOKENS = CHARACTERS + SPACE
# =============================================================================
# method: replaceMalformed
# Replaces malformed characters
# @input, str: The input string
# @return, str: The output replaced
# @completed
def replaceMalformed( input:str ) -> str:
    """Repair Turkish letters that were garbled by a wrong decoding step.

    Each rule maps the two-character sequence produced when a UTF-8 encoded
    Turkish letter is decoded as Windows-1252 back to the intended letter.
    The lowercased form "äÿ" is kept as well, because it is what "ÄŸ" turns
    into once the text has been lowercased.

    @input, str: The input string
    @return, str: The input with the known garbled sequences repaired
    """
    #: (garbled sequence, intended letter)
    repairs = (
        ("Ä±", "ı"),
        ("Ã§", "ç"),
        ("Ã¼", "ü"),
        ("ÅŸ", "ş"),
        ("Ã¶", "ö"),
        ("Ä°", "İ"),
        ("ÄŸ", "ğ"),
        ("äÿ", "ğ"),
    )
    #: The sequences are plain literals, so str.replace is enough
    for garbled, letter in repairs:
        input = input.replace( garbled, letter )
    #: Return
    return input
# method: replaceCircumflex
# Replaces the circumflex
# @input, str: The input string
# @return, str: The output string
# @completed
def replaceCircumflex( input: str ) -> str:
    """Replace circumflexed vowels with their plain Turkish base letters.

    In Turkish the circumflexed "î" / "Î" is a dotted i, so it is reduced to
    "i" / "İ" (e.g. "millî" -> "milli"), never to the dotless "ı" / "I",
    which is a different letter.

    @input, str: The input string
    @return, str: The input with Â, Î, Û, â, î, û replaced
    """
    #: One pass over the text; all keys are single characters
    plain = str.maketrans({
        "Â": "A", "Î": "İ", "Û": "U",
        "â": "a", "î": "i", "û": "u",
    })
    return input.translate( plain )
# method: getLower
# Returns the lower
# @input, str: The input string
# @return, str: Lowered version
# @completed
def getLower( input: str ) -> str:
    """Lowercase a string using Turkish casing rules.

    The Turkish-specific capitals are mapped explicitly first (most notably
    "I" -> "ı" and "İ" -> "i"); everything else is then handled by the
    generic str.lower().

    @input, str: The input string
    @return, str: Lowered version
    """
    #: Turkish capitals and their lowercase forms
    turkish_lower = str.maketrans({
        "Ş": "ş", "I": "ı", "Ü": "ü",
        "Ç": "ç", "Ö": "ö", "Ğ": "ğ",
        "İ": "i", "Â": "â", "Î": "î",
        "Û": "û",
    })
    #: Map first, then lower the remainder
    return input.translate( turkish_lower ).lower()
# method: internationalTransliterate
# International transliterate
# @input, str: The input string
# @return, str: The output string
# @completed
#: (accented / variant characters, ASCII letter they are reduced to).
#: The digraphs U+01CB, U+01C8 and the letter U+0149 are written as escapes
#: so that they stay single code points: in expanded form ("Nj", "Lj",
#: apostrophe + "n") the plain letters would be rewritten too, e.g. every
#: "j" would become "N".
_INTERNATIONAL_GROUPS = (
    ("ṠṢṤṦṨẞŚŜŞŠȘⱾꞄ", "S"),
    ("ṪṬṮṰŢŤŦƬƮȚȾꞆ", "T"),
    ("ḁẚạảấầẩẫậắằẳẵặàáâãäåāăąǎǟǡǻȁȃȧⱥɐ", "a"),
    ("ṄṆṈṊÑŃŅŇƝ\u01CBǸȠꞐ", "N"),
    ("ḰḲḴĶƘǨⱩꝀꝂꝄ", "K"),
    ("ḋḍḏḑḓďđƌȡꝺɖɗ", "d"),
    ("ẎỲỴỶỸỾÝŶŸƳȲɎ", "Y"),
    ("ḔḖḘḚḜẸẺẼẾỀỂỄỆÈÉÊËĒĔĖĘĚƎƐȄȆȨɆ", "E"),
    ("ĵǰȷɉɟʄʝ", "j"),
    ("ẐẒẔŹŻŽƵȤⱫⱿ", "Z"),
    ("ẏẙỳỵỷỹỿýÿŷƴȳɏʎ", "y"),
    ("ṳṵṷṹṻụủứừửữựùúûüũūŭůűųưǔǖǘǚǜȕȗʉ", "u"),
    ("ḱḳḵķƙǩⱪꝁꝃꝅʞ", "k"),
    ("ḡĝğġģǥǧǵꝿɠɡ", "g"),
    ("ṫṭṯṱẗţťŧƫƭțȶⱦꞇʇʈ", "t"),
    ("ḕḗḙḛḝẹẻẽếềểễệèéêëēĕėęěǝȅȇȩɇⱸɘɛɜɝɞʚ", "e"),
    ("ĴɈ", "J"),
    ("ẀẂẄẆẈŴⱲ", "W"),
    ("ṽṿỽⱱⱴʋʌ", "v"),
    ("ḂḄḆƁƂɃ", "B"),
    ("ṡṣṥṧṩẛẜẝßśŝşšſșȿꞅʂ", "s"),
    ("ḢḤḦḨḪĤĦȞⱧⱵꞍ", "H"),
    ("ḉçćĉċčƈȼꜿɕ", "c"),
    ("ḊḌḎḐḒĎĐƉƊƋꝹ", "D"),
    ("ɋꝗʠ", "q"),
    ("ḃḅḇƀƃɓ", "b"),
    ("ḬỈỊÌÍÎÏĨĪĬĮİƗǏȈȊ", "I"),
    ("ḠĜĞĠĢƓǤǦǴꝽꝾ", "G"),
    ("ẑẓẕźżžƶȥɀⱬʐʑ", "z"),
    ("ṲṴṶṸṺỤỦỨỪỬỮỰÙÚÛÜŨŪŬŮŰŲƯǓǕǗǙǛȔȖɄ", "U"),
    ("ẁẃẅẇẉẘŵⱳʍ", "w"),
    ("ḞƑꝻ", "F"),
    ("ṙṛṝṟŕŗřȑȓɍⱹꞃɹɺɻɼɽɾɿ", "r"),
    ("ẋẍ", "x"),
    ("ṼṾỼƲɅ", "V"),
    ("ɊꝖ", "Q"),
    ("ḾṀṂƜⱮ", "M"),
    ("ḣḥḧḩḫẖĥħȟⱨⱶɥɦʮʯ", "h"),
    ("ḈÇĆĈĊČƇȻꜾ", "C"),
    ("ḶḸḺḼĹĻĽĿŁ\u01C8ȽⱠⱢꝆꝈꞀ", "L"),
    ("ḟƒꝼ", "f"),
    ("ḭḯỉịìíîïĩīĭįıǐȉȋɨ", "i"),
    ("ḀẠẢẤẦẨẪẬẮẰẲẴẶÀÁÂÃÄÅĀĂĄǍǞǠǺȀȂȦȺⱯ", "A"),
    ("ṌṎṐṒỌỎỐỒỔỖỘỚỜỞỠỢÒÓÔÕÖØŌŎŐƆƟƠǑǪǬǾȌȎȪȬȮȰꝊꝌ", "O"),
    ("ṅṇṉṋñńņň\u0149ƞǹȵꞑɲɳ", "n"),
    ("ẊẌ", "X"),
    ("ṔṖƤⱣꝐꝒ", "P"),
    ("ḿṁṃɯɰɱ", "m"),
    ("ḷḹḻḽĺļľŀłƚȴⱡꝇꝉꞁꞎɫɬɭ", "l"),
    ("ṕṗƥꝑꝓ", "p"),
    ("ṘṚṜṞŔŖŘȐȒɌⱤꞂ", "R"),
    ("ṍṏṑṓọỏốồổỗộớờởỡợòóôõöøōŏőơǒǫǭǿȍȏȫȭȯȱⱺꝋꝍɔɵ", "o"),
)


def _buildInternationalTable() -> dict:
    """Flatten _INTERNATIONAL_GROUPS into a str.translate() table."""
    table = {}
    for variants, ascii_letter in _INTERNATIONAL_GROUPS:
        for variant in variants:
            #: First group wins if a character is ever listed twice
            table.setdefault( ord(variant), ascii_letter )
    return table


#: Built once at import time; reused by every call
_INTERNATIONAL_TABLE = _buildInternationalTable()


def internationalTransliterate( input: str ) -> str:
    """Reduce accented and variant Latin letters to plain ASCII letters.

    Every character listed in _INTERNATIONAL_GROUPS is replaced by the single
    ASCII letter of its group; all other characters, including plain ASCII
    letters, pass through unchanged. Note that the Turkish letters
    (ş, ğ, ı, İ, ç, ö, ü ...) are part of the groups and are reduced as well.

    @input, str: The input string
    @return, str: The output string
    """
    #: Single pass over the text
    return input.translate( _INTERNATIONAL_TABLE )
# method: correctNonutf8
# Corrects non utf8 characters
# @input, str: The input string
# @return, str: The output replaced string
# @completed
def correctNonutf8( input: str ) -> str:
    """Swap six Latin letters for the Turkish letters they stand in for.

    Replaces ý, ð, þ, Ð, Ý, Þ with ı, ğ, ş, Ğ, İ, Ş respectively and leaves
    every other character untouched.

    @input, str: The input string
    @return, str: The output replaced string
    """
    #: Stand-in letter -> Turkish letter
    turkish = str.maketrans({
        "ý": "ı", "ð": "ğ", "þ": "ş",
        "Ð": "Ğ", "Ý": "İ", "Þ": "Ş",
    })
    #: Replace and return
    return input.translate( turkish )
# method: transliterate
# Replaces Non-English characters
# @input, str: The input string
# @return, str: The output replaced string
# @completed
def transliterate( input: str ) -> str:
    """Replace the Turkish-specific letters with their closest ASCII letters.

    Handles both cases of ş, ı/İ, ü, ç, ö and ğ. The plain capital "I" is
    already ASCII and is left as it is.

    @input, str: The input string
    @return, str: The output replaced string
    """
    #: Each Turkish letter appears exactly once as a key; a repeated key
    #: would silently override the earlier entry.
    ascii_map = str.maketrans({
        "Ş": "S", "İ": "I", "Ü": "U",
        "Ç": "C", "Ö": "O", "Ğ": "G",
        "ş": "s", "ı": "i", "ü": "u",
        "ç": "c", "ö": "o", "ğ": "g",
    })
    #: Replace and return
    return input.translate( ascii_map )
# method: tokenPattern
# Returns the token pattern
# @text, str: The input string
# @return, str: The pattern
# @completed
def tokenPattern( text: str ) -> str:
    """Reduce a text to its shape.

    Each run of lowercase letters becomes "a", each run of uppercase letters
    becomes "A" and each run of digits becomes "0"; all other characters are
    kept as they are.

    @text, str: The input string
    @return, str: The pattern
    """
    #: (character-run regex, symbol), applied in this order
    shape_rules = (
        (r"[ıçşüöğa-zâîû]+", "a"),
        (r"[A-ZÂÎÛİŞÜĞÇÖ]+", "A"),
        (r"[0-9]+", "0"),
    )
    shape = text
    for run_regex, symbol in shape_rules:
        shape = re.sub(run_regex, symbol, shape)
    #: Return pattern
    return shape
# method: phonetic
# Returns the phonetic representation
# @text, str: The input text
# @return, str: The output text
# @completed
def phonetic( text: str ) -> str:
    """Build a coarse phonetic key for a lowercase text.

    Similar-sounding lowercase letters are merged into one representative
    letter, "h" is dropped, and finally any run of the same lowercase letter
    is squeezed to a single occurrence. Characters outside the listed letters
    (including capitals) are left unchanged.

    @text, str: The input text
    @return, str: The output text
    """
    #: Representative letter -> letters folded into it
    sound_groups = {
        "b": "bp",
        "c": "cçj",
        "d": "dt",
        "f": "fvw",
        "k": "gğkq",
        "i": "ıîi",
        "m": "mn",
        "o": "oö",
        "s": "sşz",
        "u": "uüû",
        "a": "â",
    }
    folding = {
        ord(member): representative
        for representative, members in sound_groups.items()
        for member in members
    }
    #: "h" is removed entirely
    folding[ord("h")] = None
    folded = text.translate(folding)
    #: Multi characters
    return re.sub(r"([iıçşüöğa-zâîû])\1+", r"\1", folded)
# method: replaceEllipsis
# Replaces the ellipsis
# @input, str: The input string
# @return, str: The output replaced
# @completed
def replaceEllipsis( input: str ) -> str:
    """Normalize dot runs and repeated ellipses to a single "…" character.

    Two or more consecutive dots become "…", "… …" groups are merged into
    one "…", and runs of "…" are squeezed to a single one.

    @input, str: The input string
    @return, str: The output replaced
    """
    #: Applied in order; every rule rewrites its match to one ellipsis
    ellipsis_rules = ( r"\.{2,}", r"(… …)+", r"…+" )
    result = input
    for rule in ellipsis_rules:
        result = re.sub( rule, "…", result )
    #: Return
    return result
# method: stripTags
# Strips the tags
# @string, str: The input html
# @return, str: The output string
# @completed
def stripTags(string:str, allowed_tags: str='') -> str:
    """Remove markup tags from a string, optionally keeping some of them.

    This is a regex-based stripper for cleaning text, not an HTML sanitizer:
    anything between "<" and the next ">" is treated as a tag.

    @string, str: The input html
    @allowed_tags, str: Comma separated tag names to keep, written either as
                        names ("b,i") or as tags ("<b>,<i>"). Names are
                        compared case-insensitively. Empty means "keep none".
    @return, str: The input with every tag that is not allowed removed;
                  opening, closing and self-closing forms of an allowed tag
                  are all kept.
    """
    #: Reduce "<b>, </i>" style input to bare lowercase names
    allowed_names = {
        name.lower()
        for name in re.sub(r'[\\/<> ]+', '', allowed_tags).split(',')
        if name
    }
    #: If no allowed tags, remove all.
    if not allowed_names:
        return re.sub(r'<[^>]*?>', '', string)

    def keepIfAllowed(match):
        """Return the tag unchanged when its name is allowed, else ''."""
        tag = match.group(0)
        #: The name is what follows "<" or "</", up to whitespace, "/" or ">"
        found = re.match(r'<\s*/?\s*([^\s/>]+)', tag)
        name = found.group(1).lower() if found else ''
        return tag if name in allowed_names else ''

    #: Visit every tag once and drop the ones that are not allowed
    return re.sub(r'<[^>]*>', keepIfAllowed, string)
# method: firstLetterUpper
# Return first letter uppered
# @input, str: The input string
# @return, str: The output
# @completed
def firstLetterUpper( input: str ) -> str:
    """Return the text in title case, using Turkish casing for "i".

    The text is lowercased with getLower() first. str.title() is language
    independent and would turn a word-initial "i" into the dotless "I", so
    those are replaced by the dotted capital "İ" before title-casing
    ("istanbul" -> "İstanbul"). The dotless "ı" is correctly uppercased to
    "I" by str.title() itself. Word boundaries are otherwise those of
    str.title(), e.g. the letter after an apostrophe is capitalized too.

    @input, str: The input string
    @return, str: The output
    """
    #: Lower case
    lowered = getLower( input )
    #: An "i" that is not preceded by a letter starts a word
    dotted = re.sub( r"(?<![^\W\d_])i", "İ", lowered )
    #: Return
    return dotted.title()
# method: replaceSpecial
# Replaces the special characters
# @input, str: The input string
# @return, str: The output string
# @completed
def replaceSpecial( input: str ) -> str:
    """Replace four special symbols with their simpler counterparts.

    "«" -> "⟨", "»" -> "⟩", "¦" -> "|" and "―" -> "-"; everything else is
    left unchanged.

    @input, str: The input string
    @return, str: The output string
    """
    #: Special symbol -> replacement
    simpler = str.maketrans({"«": "⟨", "»": "⟩", "¦": "|", "―": "-"})
    #: Replace and return
    return input.translate( simpler )
# method: replaceSpaces
# Replaces the spaces
# @input, str: The input string
# @return, str: The output replaced
# @completed
def replaceSpaces( input: str ) -> str:
    """Normalize line breaks and horizontal whitespace.

    Carriage returns are removed, leading and trailing newlines are dropped,
    three or more consecutive newlines are reduced to two, and every run of
    tabs and spaces becomes a single space.

    @input, str: The input string
    @return, str: The output replaced
    """
    #: Drop "\r" first so that "\r\n" line endings take part in the newline
    #: handling below instead of hiding the newlines from it
    input = input.replace( "\r", "" )
    #: Leading and trailing newlines
    input = input.strip( "\n" )
    #: At most one empty line between paragraphs
    input = re.sub( r"\n{3,}", "\n\n", input)
    #: Runs of tabs / spaces
    input = re.sub( r"[\t ]+", " ", input)
    #: Return
    return input
# method: trim
# Trims the sentence
# @input, str: The input string
# @return, str: The trimmed
# @completed
def trim( input: str ) -> str:
    """Collapse runs of spaces and strip surrounding whitespace.

    @input, str: The input string
    @return, str: The trimmed
    """
    #: Squeeze every run of spaces to one, then strip both ends
    space_run = re.compile( r" +" )
    squeezed = space_run.sub( " ", input )
    return squeezed.strip()
# =============================================================================
# method: cleanup
# The main function to be imported from other modules
# @text, str: The input text
# @config, list: The input text parameters
# @return, str: The output text
# @completed
def cleanup( text: str, config: list ) -> str:
    """Run the selected cleaning steps over a text, in a fixed order.

    The main function to be imported from other modules. A step runs only
    when its name is contained in config; the order of the names inside
    config does not matter, the steps always run in the order listed below.
    The result is always passed through trim() at the end.

    @text, str: The input text
    @config, list: The names of the steps to apply
    @return, str: The output text
    """
    #: (step name, step function) in execution order [01]..[13]
    pipeline = (
        ("circumflex", replaceCircumflex),
        ("ellipsis", replaceEllipsis),
        ("lower", getLower),
        ("internationaltransliterate", internationalTransliterate),
        ("correctnonutf8", correctNonutf8),
        ("transliterate", transliterate),
        ("tokenpattern", tokenPattern),
        ("phonetic", phonetic),
        ("malformed", replaceMalformed),
        ("htmltags", stripTags),
        ("title", firstLetterUpper),
        ("special", replaceSpecial),
        ("spaces", replaceSpaces),
    )
    for step_name, step in pipeline:
        if step_name in config:
            text = step( text )
    #: Returns the output text
    return trim( text )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment