Last active
September 7, 2018 18:36
-
-
Save parajain/eee2eb6fb704bc8913ce76728c0cef61 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Basic text data cleaning script.
Tokenizes sentences, removes punctuation and lowercases the tokens.
'''
import re
import string
import sys

from nltk.tokenize import word_tokenize
# Sample sentences that are cleaned when this file is run as a script.
sentences = [
    "A Socrates would perhaps have refused and died in the name of truth.",
    (
        "Hume, however, was not going to allow the stupidity of others to cut "
        "his own life short, so he did what any sensible person should do: he "
        "went along with their request without any intention of keeping his "
        "promise."
    ),
]
def filter_sentences(sentences, *, verbose=True):
    """Tokenize sentences, strip punctuation and lowercase the tokens.

    Each sentence is split into tokens with ``word_tokenize``. Every
    character listed in ``string.punctuation`` is removed from each token;
    tokens that become empty (they consisted only of such characters) are
    dropped, and the remaining tokens are lowercased.

    Args:
        sentences: Iterable of sentence strings.
        verbose: When true (the default, which matches the original
            behavior), print the resulting nested list before returning it.

    Returns:
        A list holding one list of cleaned tokens per input sentence, in
        input order.
    """
    # Character class matching any single character of string.punctuation.
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))

    tokenized_reports_no_punctuation = []
    for sentence in sentences:
        stripped = (
            punctuation_re.sub('', token) for token in word_tokenize(sentence)
        )
        # Skip tokens that were nothing but punctuation.
        tokenized_reports_no_punctuation.append(
            [token.lower() for token in stripped if token]
        )

    if verbose:
        print(tokenized_reports_no_punctuation)
    return tokenized_reports_no_punctuation
# Run the cleaner on the sample sentences; filter_sentences prints the result.
filter_sentences(sentences)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment