dyerrington · August 20, 2024 17:26
diff --git a/clean_unicode.py b/clean_unicode.py
 import regex
 import unicodedata

 # Compiled once at import time so repeated calls reuse the same pattern object.
 # The class matches any run of characters whose Unicode general category is
 # P (punctuation), S (symbol), M (mark), C (other/control) or Z (separator).
 remove_pattern = regex.compile(
     r'[\p{P}\p{S}\p{M}\p{C}\p{Z}]+',
     flags=regex.UNICODE,
 )

 def clean_unicode_text(text):
    """Return *text* with accents stripped and unwanted characters collapsed.

    The text is NFKD-normalized, combining marks are deleted, and every run
    of characters matched by the module-level ``remove_pattern`` is replaced
    by a single space. Leading and trailing whitespace is then trimmed.

    Args:
        text: The Unicode string to clean.

    Returns:
        The cleaned string; empty if nothing but removable characters
        was present.
    """
    # NFKD splits accented letters into a base letter plus combining marks.
    decomposed = unicodedata.normalize('NFKD', text)

    # Delete the marks outright instead of letting the pattern turn them into
    # spaces; otherwise a decomposed accent splits the word ("naïve" would
    # come out as "nai ve").
    without_marks = ''.join(
        ch for ch in decomposed
        if not unicodedata.category(ch).startswith('M')
    )

    # Everything else the pattern matches becomes one separating space.
    return remove_pattern.sub(' ', without_marks).strip()
	import regex
	import unicodedata

	# Compiled once at import time so repeated calls reuse the same pattern object.
	# The class matches any run of characters whose Unicode general category is
	# P (punctuation), S (symbol), M (mark), C (other/control) or Z (separator).
	remove_pattern = regex.compile(
		r'[\p{P}\p{S}\p{M}\p{C}\p{Z}]+',
		flags=regex.UNICODE,
	)

	def clean_unicode_text(text):
		"""Return *text* with accents stripped and unwanted characters collapsed.

		The text is NFKD-normalized, combining marks are deleted, and every run
		of characters matched by the module-level ``remove_pattern`` is replaced
		by a single space. Leading and trailing whitespace is then trimmed.

		Args:
			text: The Unicode string to clean.

		Returns:
			The cleaned string; empty if nothing but removable characters
			was present.
		"""
		# NFKD splits accented letters into a base letter plus combining marks.
		decomposed = unicodedata.normalize('NFKD', text)

		# Delete the marks outright instead of letting the pattern turn them into
		# spaces; otherwise a decomposed accent splits the word ("naïve" would
		# come out as "nai ve").
		without_marks = ''.join(
			ch for ch in decomposed
			if not unicodedata.category(ch).startswith('M')
		)

		# Everything else the pattern matches becomes one separating space.
		return remove_pattern.sub(' ', without_marks).strip()