Skip to content

Instantly share code, notes, and snippets.

@rohit-gupta
Created October 17, 2017 09:12
Show Gist options
  • Save rohit-gupta/bf51c600d312e11a655b787b197ec15c to your computer and use it in GitHub Desktop.
Save rohit-gupta/bf51c600d312e11a655b787b197ec15c to your computer and use it in GitHub Desktop.
Simple function to remove English-language stop words without using NLTK
from stopwords import remove_stopwords
# Usage example: strip English stop words from a short passage.
dummy_string = (
    "Mr. and Mrs. Dursley, of number four, Privet Drive, were "
    "proud to say that they were perfectly normal, thank you "
    "very much. They were the last people you'd expect to be "
    "involved in anything strange or mysterious, because they "
    "just didn't hold with such nonsense."
)

# Simple word tokenizer: turn commas and full stops into spaces, then split
# on single spaces. Consecutive spaces yield empty-string tokens; these are
# expected to be dropped by remove_stopwords along with the stop words.
dummy_list = dummy_string.replace(",", " ").replace(".", " ").split(" ")

# Remove stop words
non_stop_words = remove_stopwords(dummy_list)

# Display texts. A single pre-formatted argument makes each call valid both
# as a Python 2 print statement and as the Python 3 print function, with
# identical output ("label value") in either case.
print("Original: %s" % dummy_string)
print("Non-Stopwords: %s" % (non_stop_words,))
# Lower-case tokens that remove_stopwords() filters out. Besides ordinary
# English function words, the list also contains the empty string, a single
# space and "&", so leftovers from naive space-splitting are discarded too.
# Kept as a plain list, in its original order.
stop_words = [
    "all", "over", "with", "had", "to", "has",
    "do", "very", "not", "this", "some", "are",
    "out", "for", "be", "we", "by", "on",
    "about", "of", "or", "your", "from", "there",
    "been", "was", "that", "but", "he", "me",
    "will", "my", "and", "is", "it", "an",
    "as", "at", "have", "in", "no", "which",
    "you", "a", "i", "the", "", "just",
    "being", "both", "through", "during", "its", "before",
    " ", "how", "should", "only", "under", "ours",
    "them", "his", "get", "stop", "they", "yourselves",
    "now", "him", "nor", "did", "she", "each",
    "further", "where", "few", "because", "doing", "theirs",
    "up", "our", "ourselves", "what", "below", "does",
    "above", "between", "t", "after", "here", "hers",
    "her", "against", "s", "own", "into", "yourself",
    "down", "would", "&", "their", "too", "then",
    "themselves", "until", "more", "himself", "bus", "don",
    "herself", "than", "those", "myself", "these", "whom",
    "while", "can", "were", "give", "am", "itself",
    "any", "if", "again", "when", "same", "also",
    "other", "take", "may", "who", "most", "such",
    "why", "off", "having", "so", "yours", "once",
]
def remove_stopwords(raw_words_list):
    """Return the words from *raw_words_list* that are not stop words.

    Matching is case-insensitive: every word is lower-cased before it is
    looked up in the module-level ``stop_words`` collection. Surviving
    words keep their original casing and relative order.

    Args:
        raw_words_list: Iterable of string tokens (anything with ``.lower()``).

    Returns:
        A new list containing only the tokens that are not stop words.
        The input is not modified.
    """
    # Snapshot the stop words into a set on each call: membership tests
    # become O(1) instead of a linear scan of the list per token, while
    # later changes to ``stop_words`` are still honoured.
    stop_word_set = frozenset(stop_words)
    return [
        word for word in raw_words_list
        if word.lower() not in stop_word_set
    ]
if __name__ == '__main__':
    # Small self-contained demo of remove_stopwords().
    dummy_string = (
        "Mr. and Mrs. Dursley, of number four, Privet Drive, "
        "were proud to say that they were perfectly normal, thank you "
        "very much. They were the last people you'd expect to be "
        "involved in anything strange or mysterious, because they "
        "just didn't hold with such nonsense."
    )

    # Simple word tokenizer: turn commas and full stops into spaces, then
    # split on single spaces. Consecutive spaces yield empty-string tokens,
    # which the stop word list ('' is an entry) filters out below.
    dummy_list = dummy_string.replace(",", " ").replace(".", " ").split(" ")

    # Remove stop words
    non_stop_words = remove_stopwords(dummy_list)

    # Display texts. A single pre-formatted argument makes each call valid
    # both as a Python 2 print statement and as the Python 3 print function,
    # with identical output ("label value") in either case.
    print("Original text: %s" % dummy_string)
    print("Words which are not stop words: %s" % (non_stop_words,))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment