Created
April 20, 2022 01:29
-
-
Save audhiaprilliant/2a59d48e089dade7e7c2e8e443aa33b1 to your computer and use it in GitHub Desktop.
How to Automatically Build Stopwords
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Lower text
def lowerCase(text):
    """Return ``text`` converted to lowercase.

    The conversion is delegated to the ``lower()`` method of ``text``.
    """
    lowered = text.lower()
    return lowered
# Numbers removal
def numberRemoval(text):
    """Replace every digit character in ``text`` with a single space.

    Each digit is substituted individually, so a run of N digits becomes
    N spaces; the string length is preserved.

    Args:
        text: The string to process.

    Returns:
        A copy of ``text`` with each digit replaced by ``' '``.
    """
    # Raw string: a plain '\d' literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(pattern=r'\d', repl=' ', string=text)
# Non-alphanumeric removal
def nonAlphanumericRemoval(text):
    """Replace every character that is not a letter a-z/A-Z or a digit.

    Each such character (punctuation, whitespace, underscores, symbols,
    ...) is replaced by a single space, so the string length is preserved.

    Args:
        text: The string to process.

    Returns:
        A copy of ``text`` where every character outside ``a-z``, ``A-Z``
        and ``\\d`` has been replaced by ``' '``.
    """
    # Raw string: a plain literal containing '\d' is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(pattern=r'[^a-zA-Z\d]', repl=' ', string=text)
# Whitespaces removal
def whitespacesRemoval(text):
    """Trim ``text`` and collapse runs of spaces into a single space.

    The input is first converted with ``str()``, so non-string values are
    accepted. Leading and trailing whitespace is stripped, then every run
    of one or more space characters is replaced by exactly one space.
    Only the space character is collapsed; interior tabs or newlines are
    left as they are.

    Args:
        text: The value to clean; coerced to ``str``.

    Returns:
        The trimmed string with consecutive spaces collapsed.
    """
    # A single strip() is equivalent to lstrip() followed by rstrip().
    trimmed = str(text).strip()
    return re.sub(pattern=' +', repl=' ', string=trimmed)
# Compile text preprocessing into one function
def textPreprocessing(text):
    """Run the full cleaning pipeline on ``text`` and return the result.

    The steps are applied in this order, each receiving the previous
    step's output: lowerCase, numberRemoval, nonAlphanumericRemoval,
    whitespacesRemoval.
    """
    pipeline = (
        lowerCase,
        numberRemoval,
        nonAlphanumericRemoval,
        whitespacesRemoval,
    )
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned
# Perform simple text preprocessing
# `text` is expected to be defined earlier in the script.
text_clean = textPreprocessing(text=text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment