Created
April 29, 2019 10:14
-
-
Save madcato/ba9037d04ff0b8c536884cdb3873ef17 to your computer and use it in GitHub Desktop.
Function for preparing any text for ML
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import nltk
from nltk.corpus import stopwords

# Fetch the stop-word corpus before it is first read below.
nltk.download('stopwords')
# Punctuation treated as a token separator: each match is replaced by a space.
# Raw strings keep the regex escapes (\[ \] \|) from being parsed as (invalid)
# Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character that is not a digit, a lowercase ASCII letter, a space,
# '#', '+' or '_' is deleted outright.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words from the NLTK corpus, held in a set for fast lookups.
STOPWORDS = set(stopwords.words('english'))
def text_prepare(text):
    """Normalize a piece of text into a space-separated string of tokens.

    Steps, in order:
      1. lowercase the text;
      2. collapse every run of whitespace (tabs, newlines, ...) to one space;
      3. replace characters matched by REPLACE_BY_SPACE_RE with a space;
      4. delete characters matched by BAD_SYMBOLS_RE;
      5. drop tokens that appear in STOPWORDS.

    text: a string
    return: the cleaned string, tokens joined by single spaces
    """
    text = text.lower()
    # BAD_SYMBOLS_RE deletes anything outside its allowed set, which includes
    # tabs and newlines; without this step words separated only by such
    # whitespace would be fused together ("a\nb" -> "ab").
    text = ' '.join(text.split())
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # The text is already lowercase, so tokens compare directly to STOPWORDS.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
# Example call: text_prepare("Hola {que ta/^^#l} the [done] of buff")
def test_text_prepare():
    """Smoke-test text_prepare against two known input/output pairs.

    return: a failure message naming the first input whose result differs
            from the expected string, or a success message if all match.
    """
    cases = (
        ("SQL Server - any equivalent of Excel's CHOOSE function?",
         "sql server equivalent excels choose function"),
        ("How to free c++ memory vector<int> * arr?",
         "free c++ memory vectorint arr"),
    )
    # Lazily find the first mismatching input; None means every case passed.
    failed = next(
        (raw for raw, expected in cases if text_prepare(raw) != expected),
        None,
    )
    if failed is not None:
        return "Wrong answer for the case: '%s'" % failed
    return 'Basic tests are passed.'
# Run the smoke test when the file executes and echo its verdict.
verdict = test_text_prepare()
print(verdict)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for providing the solution