Created
August 29, 2019 18:32
-
-
Save Lexie88rus/8a4787207ec712f2db800fd70a3a003a to your computer and use it in GitHub Desktop.
Title cleaning function
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Lowercase, remove punctuation and numbers from kernel titles

# One or more consecutive "separator" characters: the listed ASCII
# punctuation, ASCII digits and the space character.  Every member is spelled
# out (with `[`, `\`, `]` and `-` escaped) so that nothing inside the class can
# be read as an accidental range.  Double quotes are intentionally NOT in the
# set, so they are left in the title.
_TITLE_SEPARATOR_RE = re.compile(r"[,.;@#?!&$%<>\[\\\]^_*/()~='+:`\-0-9 ]+")


def clean_title(title):
    """Normalize a kernel title to lowercase words separated by single spaces.

    The value is converted with ``str()`` and lowercased; every run of
    punctuation, ASCII digits and spaces is then collapsed into one space,
    and leading/trailing whitespace is stripped.

    Args:
        title: The title to clean.  Any object is accepted; it is passed
            through ``str()`` first, so ``None`` becomes ``"none"`` and a
            number becomes its digits (which are then removed).

    Returns:
        The cleaned title as a ``str``; empty if nothing but separators
        (or whitespace) was present.
    """
    lowered = str(title).lower()
    # A single pass is enough: each run of separators becomes one space, so
    # no duplicated spaces are left to squeeze afterwards.
    return _TITLE_SEPARATOR_RE.sub(" ", lowered).strip()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment