Last active
April 8, 2022 15:30
-
-
Save Steboss89/73e1f3bbf1a62403ccad53f7e63a6581 to your computer and use it in GitHub Desktop.
cleaning of input text data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the full text of every .txt file under books/ into `data`, one string
# per file, with leading/trailing whitespace stripped.
data = []
ifiles = glob.glob("books/*.txt")
for ifile in ifiles:
    # The context manager closes each handle as soon as it has been read,
    # instead of leaving it open until garbage collection.
    with open(ifile, "r") as fh:
        book = fh.read().strip()
    data.append(book)

# Base English stop-word list (presumably NLTK's `stopwords` corpus -- confirm
# against the file's imports), extended with extra words to filter out.
stop_words = stopwords.words('english')
stop_words.extend(["thy","thou","thee", "hath", "upon", "me", "him", "them", "shall","ye", "one", "unto", "us"])
def remove_stopwords(text, stop_words):
    """Return *text* without the tokens that appear in *stop_words*.

    The text is split on whitespace, tokens found in ``stop_words`` are
    dropped, and the remaining tokens are re-joined with single spaces.
    Matching is exact and case-sensitive.

    Args:
        text: The string to filter.
        stop_words: Container of tokens to remove.

    Returns:
        The filtered text as a single space-separated string.
    """
    # Membership tests against a list/tuple scan the whole sequence for every
    # token; convert once to a set for constant-time lookups. Other container
    # types are left untouched so their own `in` semantics are preserved.
    if isinstance(stop_words, (list, tuple)):
        stop_words = set(stop_words)
    return ' '.join(word for word in text.split() if word not in stop_words)
# Normalise every book in `data` in place: strip reference markers and chapter
# headings, lower-case, drop punctuation, collapse whitespace, and remove
# stop words.

# Loop invariants, built once instead of once per book.
punct_table = str.maketrans('', '', string.punctuation)
stop_set = set(stop_words)  # constant-time membership for the stop-word filter

for i, book in enumerate(data):
    # Remove every NUMBER:NUMBER. marker (the pattern is not anchored, so it
    # matches anywhere in the text, not only at the start of a line).
    text = re.sub(r"\d{1,}\:\d{1,}\.", "", book)
    # Remove "NAME Chapter NUMBER" headings. This must run before lower-casing
    # because the pattern matches the capitalised word "Chapter".
    text = re.sub(r"\w{1,} Chapter \d{1,}", "", text)
    text = text.lower()
    # Replace literal backslash-n sequences with a space. This has to happen
    # BEFORE punctuation is stripped: string.punctuation contains the
    # backslash, so afterwards the pattern could never match and a stray "n"
    # would be left glued to the neighbouring words.
    text = re.sub(r"\\n", " ", text)
    # Remove punctuation.
    text = text.translate(punct_table)
    # Collapse newlines and any other whitespace runs into single spaces.
    text = re.sub(r"\s+", " ", text)
    # Remove stop words.
    data[i] = remove_stopwords(text, stop_set)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment