Created
July 30, 2019 06:09
-
-
Save agustinustheo/bcb3336a0e1bd5a720047d035e9c0f41 to your computer and use it in GitHub Desktop.
Preprocess Dataframe function for SMS Classifier Blog
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ordered (pattern, replacement) pairs applied by preproccess_df.
# Order matters: placeholder tokens are inserted first, while digits and
# punctuation are still intact, and whitespace is normalised last.
_SUBSTITUTIONS = (
    # Email addresses -> 'almtemail'. Matched per token rather than anchored
    # to the whole message, so surrounding words are kept.
    (r'[^\s@]+@[^\s@.][^\s@]*\.[a-z]{2,}', ' almtemail '),
    # Phone numbers (parentheses, spaces, no spaces, dashes) -> 'nmrtlpn'.
    (r'(\()?(\+62|62|0)(\d{2,3})?\)?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}',
     ' nmrtlpn '),
    # URLs -> 'almtweb'.
    (r'[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b'
     r'([-a-zA-Z0-9@:%_\+.~#?&//=]*)',
     ' almtweb '),
    # Drop the leftover scheme. A single 'https?' pattern avoids the stray
    # 's' that removing 'http' before 'https' would leave behind.
    (r'https?', ''),
    # Money symbols -> 'symbuang'.
    (r'£|\$', ' symbuang '),
    # Numbers -> 'noomr'.
    (r'\d+(\.\d+)?', ' noomr '),
    # Rupiah marker ('rp' / 'rp.') -> 'symbuang'. Runs after the number step
    # so that 'rp5000' has already become 'rp noomr' and the word boundaries
    # hold; the dot is escaped so it cannot swallow an arbitrary character.
    (r'\brp\b\.?', ' symbuang '),
    # Punctuation -> space.
    (r'[.,\/#!%\^&\*;:{}=\-_`~()?]', ' '),
    # Collapse runs of whitespace into a single space.
    (r'\s+', ' '),
)


def preproccess_df(text_messages):
    """Normalise raw SMS texts into placeholder-tokenised, lower-case strings.

    Each message is lower-cased, then email addresses, phone numbers, URLs,
    currency markers and numbers are replaced by fixed placeholder tokens
    ('almtemail', 'nmrtlpn', 'almtweb', 'symbuang', 'noomr'). Punctuation is
    removed and whitespace is collapsed and trimmed.

    Args:
        text_messages: pandas Series of strings (anything exposing the
            ``.str`` accessor).

    Returns:
        A Series with the same index holding the cleaned messages.
    """
    # Lower-case first so 'Hello', 'HELLO' and 'hello' become the same word.
    processed = text_messages.str.lower()
    for pattern, replacement in _SUBSTITUTIONS:
        # regex=True is explicit: pandas >= 2.0 defaults to literal matching,
        # which would silently turn every pattern above into a no-op.
        processed = processed.str.replace(pattern, replacement, regex=True)
    # Remove leading and trailing whitespace.
    return processed.str.strip()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment