Skip to content

Instantly share code, notes, and snippets.

@Steboss89
Last active April 8, 2022 15:30
Show Gist options
  • Save Steboss89/73e1f3bbf1a62403ccad53f7e63a6581 to your computer and use it in GitHub Desktop.
Save Steboss89/73e1f3bbf1a62403ccad53f7e63a6581 to your computer and use it in GitHub Desktop.
cleaning of input text data
# Load every plain-text book under books/ into memory, one string per file.
# NOTE: glob.glob() returns paths in arbitrary order, so the order of `data`
# follows `ifiles`, not the alphabetical order of the file names.
data = []
ifiles = glob.glob("books/*.txt")
for ifile in ifiles:
    # The context manager closes each handle as soon as the book is read,
    # instead of leaving it open until garbage collection.
    with open(ifile, "r") as book_file:
        book = book_file.read().strip()
    data.append(book)
# Base English stop-word list (presumably NLTK's `stopwords` corpus -- the
# import is not visible here), extended in place with extra words to filter.
stop_words = stopwords.words('english')
stop_words += [
    "thy", "thou", "thee", "hath", "upon", "me", "him", "them",
    "shall", "ye", "one", "unto", "us",
]
def remove_stopwords(text, stop_words):
    """Return *text* with every stop word removed.

    The text is split on whitespace, tokens found in ``stop_words`` are
    dropped, and the remaining tokens are joined with single spaces, so
    leading, trailing and repeated whitespace is normalised as well.
    Matching is exact and case-sensitive.

    Args:
        text: The string to filter.
        stop_words: Iterable of hashable tokens (strings) to drop.

    Returns:
        The filtered text; an empty string if nothing remains.
    """
    # Build the lookup once per call: testing membership in a set is O(1),
    # whereas scanning a list costs O(len(stop_words)) for every word.
    blocked = frozenset(stop_words)
    return ' '.join(word for word in text.split() if word not in blocked)
# Normalise every book in place: strip verse and chapter markers, lowercase,
# drop punctuation, collapse whitespace and remove stop words.
# Patterns and lookup tables are loop-invariant, so they are built once here.
_verse_marker = re.compile(r"\d+:\d+\.")            # NUMBER:NUMBER. e.g. "3:16."
_chapter_heading = re.compile(r"\w+ Chapter \d+")   # NAME Chapter NUMBER
_literal_newline = re.compile(r"\\n")               # backslash followed by "n"
_whitespace_run = re.compile(r"\s+")                # raw string: "\s" is not a valid str escape
_punctuation_table = str.maketrans('', '', string.punctuation)
# A set gives O(1) membership tests during stop-word filtering.
_stop_word_set = frozenset(stop_words)

for i, book in enumerate(data):
    # The pattern is unanchored, so verse markers are removed wherever they occur.
    text = _verse_marker.sub("", book)
    # Must run before lowercasing: the pattern matches a capitalised "Chapter".
    text = _chapter_heading.sub("", text)
    # Must run before punctuation stripping: string.punctuation contains the
    # backslash, so afterwards an escaped "\n" would already be reduced to a
    # bare "n" glued to the neighbouring word and could never be matched.
    text = _literal_newline.sub(" ", text)
    text = text.lower()
    text = text.translate(_punctuation_table)
    # Collapse real newlines, tabs and repeated spaces into single spaces.
    text = _whitespace_run.sub(" ", text)
    data[i] = remove_stopwords(text, _stop_word_set)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment