parajain · September 7, 2018 18:36
diff --git a/data_cleaning.py b/data_cleaning.py
 '''
 Basic text data cleaning script.
 Tokenizes sentences, strips punctuation, and lowercases the remaining tokens.
 '''

 import sys
 import re
 import string
 from nltk.tokenize import word_tokenize

 # Sample input: two English sentences used to exercise the cleaning function.
 sentences = [
     "A Socrates would perhaps have refused and died in the name of truth.",
     "Hume, however, was not going to allow the stupidity of others to cut his own life short, so he did what any sensible person should do: he went along with their request without any intention of keeping his promise.",
 ]

 def filter_sentences(sentences):
  '''
  Tokenize every sentence, drop punctuation, and lowercase what remains.

  Each sentence is split with nltk's word_tokenize. Every character found
  in string.punctuation is deleted from each token, and tokens left empty
  by that deletion are discarded. The cleaned result is printed and then
  returned as a list holding one list of tokens per input sentence.
  '''
  token_lists = [word_tokenize(sentence) for sentence in sentences]

  # Character class matching any single ASCII punctuation character.
  punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))

  cleaned_sentences = []
  for raw_tokens in token_lists:
      # Strip punctuation lazily, then keep only tokens with content left.
      stripped = (punctuation_pattern.sub(u'', raw) for raw in raw_tokens)
      cleaned_sentences.append([tok.lower() for tok in stripped if tok != u''])

  print(cleaned_sentences)
  return cleaned_sentences

 # Run the cleaning function on the sample sentences; the function itself
 # prints the cleaned token lists, so the return value is not used here.
 filter_sentences(sentences)
	'''
	Basic text data cleaning script.
	Tokenizes sentences, strips punctuation, and lowercases the remaining tokens.
	'''

	import sys
	import re
	import string
	from nltk.tokenize import word_tokenize

	# Sample input: two English sentences used to exercise the cleaning function.
	sentences = [
		"A Socrates would perhaps have refused and died in the name of truth.",
		"Hume, however, was not going to allow the stupidity of others to cut his own life short, so he did what any sensible person should do: he went along with their request without any intention of keeping his promise.",
	]

	def filter_sentences(sentences):
		'''
		Tokenize sentences, remove punctuation, and lowercase the tokens.

		Args:
			sentences: iterable of sentence strings.

		Returns:
			A list with one entry per input sentence; each entry is the list
			of that sentence's tokens (as produced by nltk's word_tokenize)
			with every string.punctuation character removed and the rest
			lowercased. Tokens that consisted only of punctuation are dropped.

		The cleaned token lists are also printed before being returned.
		'''
		tokenized_sentences = [word_tokenize(s) for s in sentences]

		# Character class matching any single ASCII punctuation character.
		regex = re.compile('[%s]' % re.escape(string.punctuation))
		tokenized_reports_no_punctuation = []

		for s in tokenized_sentences:
			# Strip punctuation from each token; a token made up entirely of
			# punctuation becomes '' and is filtered out below.
			stripped_tokens = (regex.sub(u'', token) for token in s)
			new_s = [token.lower() for token in stripped_tokens if token]
			tokenized_reports_no_punctuation.append(new_s)

		print(tokenized_reports_no_punctuation)
		return tokenized_reports_no_punctuation

	# Run the cleaning function on the sample sentences; the function itself
	# prints the cleaned token lists, so the return value is not used here.
	filter_sentences(sentences)