Skip to content

Instantly share code, notes, and snippets.

@madcato
Created April 29, 2019 10:14
Show Gist options
  • Save madcato/ba9037d04ff0b8c536884cdb3873ef17 to your computer and use it in GitHub Desktop.
Save madcato/ba9037d04ff0b8c536884cdb3873ef17 to your computer and use it in GitHub Desktop.
Function for preparing arbitrary text for machine learning (ML): it lowercases the text, strips symbols, and removes English stopwords.
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Characters that act as separators; each match is replaced by a single space.
# Raw strings keep the regex escapes (\[, \], \|) from being parsed as
# (invalid) Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, lowercase ASCII letter, space, '#', '+' or '_'
# is removed outright.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NLTK's English stopword list, held as a set for membership tests.
STOPWORDS = {*stopwords.words('english')}
def text_prepare(text):
    """Normalize a raw string for use as ML input.

    The steps run in this order:
      1. lowercase the text;
      2. replace every character matched by REPLACE_BY_SPACE_RE with a space;
      3. delete every character matched by BAD_SYMBOLS_RE;
      4. split on whitespace and drop every token found in STOPWORDS.

    Args:
        text: a string.

    Returns:
        The cleaned text, with the remaining tokens joined by single spaces.
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # The text is already lowercase here, so tokens can be compared to the
    # stopword set directly.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
# text_prepare("Hola {que ta/^^#l} the [done] of buff")
def test_text_prepare():
    """Check text_prepare against two hand-written examples.

    Returns:
        'Basic tests are passed.' when every example produces the expected
        output; otherwise a message naming the first example that did not.
    """
    examples = ["SQL Server - any equivalent of Excel's CHOOSE function?",
                "How to free c++ memory vector<int> * arr?"]
    answers = ["sql server equivalent excels choose function",
               "free c++ memory vectorint arr"]
    for ex, ans in zip(examples, answers):
        if text_prepare(ex) != ans:
            return "Wrong answer for the case: '%s'" % ex
    return 'Basic tests are passed.'


# Run the self-check when the script is executed and show its result.
print(test_text_prepare())
@ravikumar1976
Copy link

Thank you for providing the solution.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment