Created
October 17, 2017 09:12
-
-
Save rohit-gupta/bf51c600d312e11a655b787b197ec15c to your computer and use it in GitHub Desktop.
Simple function to remove English-language stopwords without using NLTK
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example usage of remove_stopwords() from the stopwords module.
from stopwords import remove_stopwords

dummy_string = ("""Mr. and Mrs. Dursley, of number four, Privet Drive, were """
                """proud to say that they were perfectly normal, thank you """
                """very much. They were the last people you'd expect to be """
                """involved in anything strange or mysterious, because they """
                """just didn't hold with such nonsense.""")

# Simple word tokenizer: turn commas and periods into spaces, then split on
# single spaces. Consecutive spaces therefore yield empty-string tokens.
dummy_list = dummy_string.replace(",", " ").replace(".", " ").split(" ")

# Remove stop words
non_stop_words = remove_stopwords(dummy_list)

# Display texts. A single pre-formatted argument is passed to print() so the
# output is the same under the Python 2 print statement and the Python 3
# print function.
print("Original: " + dummy_string)
print("Non-Stopwords: %s" % (non_stop_words,))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Lower-case tokens that remove_stopwords() filters out. Besides common
# English function words, the list contains the empty string and a single
# space (the kind of tokens a naive split(" ") tokenizer produces) and '&'.
stop_words = ['all', 'over', 'with', 'had', 'to', 'has', 'do', 'very', 'not',
              'this', 'some', 'are', 'out', 'for', 'be', 'we', 'by', 'on',
              'about', 'of', 'or', 'your', 'from', 'there', 'been', 'was',
              'that', 'but', 'he', 'me', 'will', 'my', 'and', 'is', 'it', 'an',
              'as', 'at', 'have', 'in', 'no', 'which', 'you', 'a', 'i', 'the',
              '', 'just', 'being', 'both', 'through', 'during', 'its',
              'before', ' ', 'how', 'should', 'only', 'under', 'ours', 'them',
              'his', 'get', 'stop', 'they', 'yourselves', 'now', 'him', 'nor',
              'did', 'she', 'each', 'further', 'where', 'few', 'because',
              'doing', 'theirs', 'up', 'our', 'ourselves', 'what', 'below',
              'does', 'above', 'between', 't', 'after', 'here', 'hers', 'her',
              'against', 's', 'own', 'into', 'yourself', 'down', 'would', '&',
              'their', 'too', 'then', 'themselves', 'until', 'more', 'himself',
              'bus', 'don', 'herself', 'than', 'those', 'myself', 'these', 'whom',
              'while', 'can', 'were', 'give', 'am', 'itself', 'any', 'if',
              'again', 'when', 'same', 'also', 'other', 'take', 'may', 'who',
              'most', 'such', 'why', 'off', 'having', 'so', 'yours', 'once']


def remove_stopwords(raw_words_list):
    """Return the words of *raw_words_list* that are not stop words.

    Each word is lower-cased before being compared against ``stop_words``,
    so matching is case-insensitive. Words that are kept are returned
    unchanged (original casing) and in their original order.

    Args:
        raw_words_list: Iterable of string tokens.

    Returns:
        A new list holding the tokens whose lower-cased form is not in
        ``stop_words``.
    """
    # Build the lookup set once per call instead of scanning the list for
    # every word. Reading ``stop_words`` at call time keeps any additions or
    # removals made by callers to the public list effective.
    stop_word_set = set(stop_words)
    return [word for word in raw_words_list
            if word.lower() not in stop_word_set]
if __name__ == '__main__':
    # Small self-contained demo, run only when the module is executed directly.
    dummy_string = ("""Mr. and Mrs. Dursley, of number four, Privet Drive, """
                    """were proud to say that they were perfectly normal, thank you """
                    """very much. They were the last people you'd expect to be """
                    """involved in anything strange or mysterious, because they """
                    """just didn't hold with such nonsense.""")

    # Simple word tokenizer: turn commas and periods into spaces, then split
    # on single spaces. Consecutive spaces yield empty-string tokens, which
    # the stop-word filter drops because '' is listed in stop_words.
    dummy_list = dummy_string.replace(",", " ").replace(".", " ").split(" ")

    # Remove stop words
    non_stop_words = remove_stopwords(dummy_list)

    # Display texts. A single pre-formatted argument is passed to print() so
    # the output is the same under the Python 2 print statement and the
    # Python 3 print function.
    print("Original text: " + dummy_string)
    print("Words which are not stop words: %s" % (non_stop_words,))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment