Skip to content

Instantly share code, notes, and snippets.

@audhiaprilliant
Created April 20, 2022 01:29
Show Gist options
  • Select an option

  • Save audhiaprilliant/2a59d48e089dade7e7c2e8e443aa33b1 to your computer and use it in GitHub Desktop.

Select an option

Save audhiaprilliant/2a59d48e089dade7e7c2e8e443aa33b1 to your computer and use it in GitHub Desktop.
How to Automatically Build Stopwords
# Lower text
def lowerCase(text):
    """Return ``text`` with every cased character converted to lowercase.

    Args:
        text: The string to convert; it must provide a ``lower()`` method.

    Returns:
        The lowercased copy of ``text``.
    """
    return text.lower()
# Numbers removal
def numberRemoval(text):
    """Replace every digit character in ``text`` with a single space.

    Each digit is substituted individually, so a run of N digits becomes
    N spaces; the overall string length is preserved.

    Args:
        text: The string to process.

    Returns:
        A copy of ``text`` in which each digit has been replaced by ``' '``.
    """
    # Raw string: a plain '\d' literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(r'\d', ' ', text)
# Non-alphanumeric removal
def nonAlphanumericRemoval(text):
    """Replace every character that is not an ASCII letter or a digit with a space.

    Characters are substituted one by one, so the string length is
    preserved. Existing spaces are themselves matched and replaced by a
    space, which leaves them unchanged.

    Args:
        text: The string to process.

    Returns:
        A copy of ``text`` in which each character outside ``a-z``, ``A-Z``
        and ``\\d`` has been replaced by ``' '``.
    """
    # Raw string: a plain '[^a-zA-Z\d]' literal contains an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'[^a-zA-Z\d]', ' ', text)
# Whitespaces removal
def whitespacesRemoval(text):
    """Trim ``text`` and collapse runs of spaces into a single space.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Leading and trailing whitespace of any kind is stripped, but
    only runs of the space character are collapsed: tabs or newlines in the
    middle of the string are left untouched.

    Args:
        text: The value to clean.

    Returns:
        The trimmed string with consecutive spaces reduced to one.
    """
    # A single strip() is equivalent to lstrip() followed by rstrip().
    trimmed = str(text).strip()
    # Substitute multiple spaces with a single space
    return re.sub(' +', ' ', trimmed)
# Compile text preprocessing into one function
def textPreprocessing(text):
    """Run the complete cleaning pipeline on ``text``.

    The steps are applied in this order, each receiving the previous
    step's output:

    1. ``lowerCase``
    2. ``numberRemoval``
    3. ``nonAlphanumericRemoval``
    4. ``whitespacesRemoval``

    Args:
        text: The raw string to clean.

    Returns:
        The value returned by the final step, ``whitespacesRemoval``.
    """
    pipeline = (
        lowerCase,
        numberRemoval,
        nonAlphanumericRemoval,
        whitespacesRemoval,
    )
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned
# Perform simple text preprocessing on the raw input text
text_clean = textPreprocessing(text=text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment