Last active
October 17, 2022 20:13
-
-
Save ethen8181/d57e762f81aa643744c2ffba5688d33a to your computer and use it in GitHub Desktop.
sklearn & nltk english stopwords
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""English stop words: the union of the NLTK and scikit-learn lists.

The set below is a hard-coded snapshot of what this produced at the time
it was captured::

    from nltk.corpus import stopwords
    from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
    ENGLISH_STOP_WORDS = set(stopwords.words('english')).union(set(ENGLISH_STOP_WORDS))

NOTE(review): recent scikit-learn releases expose the list as
``sklearn.feature_extraction.text.ENGLISH_STOP_WORDS``; adjust the import
above if you regenerate the snapshot -- confirm against the installed
version. The upstream lists may also have changed since this snapshot.
"""

# Entries are kept verbatim and in alphabetical order, including odd
# spellings such as 'amoungst' and 'fify' (presumably inherited from the
# upstream lists), so membership stays identical to the snapshot.
ENGLISH_STOP_WORDS = {
    'a',
    'about',
    'above',
    'across',
    'after',
    'afterwards',
    'again',
    'against',
    'ain',
    'all',
    'almost',
    'alone',
    'along',
    'already',
    'also',
    'although',
    'always',
    'am',
    'among',
    'amongst',
    'amoungst',
    'amount',
    'an',
    'and',
    'another',
    'any',
    'anyhow',
    'anyone',
    'anything',
    'anyway',
    'anywhere',
    'are',
    'aren',
    'around',
    'as',
    'at',
    'back',
    'be',
    'became',
    'because',
    'become',
    'becomes',
    'becoming',
    'been',
    'before',
    'beforehand',
    'behind',
    'being',
    'below',
    'beside',
    'besides',
    'between',
    'beyond',
    'bill',
    'both',
    'bottom',
    'but',
    'by',
    'call',
    'can',
    'cannot',
    'cant',
    'co',
    'con',
    'could',
    'couldn',
    'couldnt',
    'cry',
    'd',
    'de',
    'describe',
    'detail',
    'did',
    'didn',
    'do',
    'does',
    'doesn',
    'doing',
    'don',
    'done',
    'down',
    'due',
    'during',
    'each',
    'eg',
    'eight',
    'either',
    'eleven',
    'else',
    'elsewhere',
    'empty',
    'enough',
    'etc',
    'even',
    'ever',
    'every',
    'everyone',
    'everything',
    'everywhere',
    'except',
    'few',
    'fifteen',
    'fify',
    'fill',
    'find',
    'fire',
    'first',
    'five',
    'for',
    'former',
    'formerly',
    'forty',
    'found',
    'four',
    'from',
    'front',
    'full',
    'further',
    'get',
    'give',
    'go',
    'had',
    'hadn',
    'has',
    'hasn',
    'hasnt',
    'have',
    'haven',
    'having',
    'he',
    'hence',
    'her',
    'here',
    'hereafter',
    'hereby',
    'herein',
    'hereupon',
    'hers',
    'herself',
    'him',
    'himself',
    'his',
    'how',
    'however',
    'hundred',
    'i',
    'ie',
    'if',
    'in',
    'inc',
    'indeed',
    'interest',
    'into',
    'is',
    'isn',
    'it',
    'its',
    'itself',
    'just',
    'keep',
    'last',
    'latter',
    'latterly',
    'least',
    'less',
    'll',
    'ltd',
    'm',
    'ma',
    'made',
    'many',
    'may',
    'me',
    'meanwhile',
    'might',
    'mightn',
    'mill',
    'mine',
    'more',
    'moreover',
    'most',
    'mostly',
    'move',
    'much',
    'must',
    'mustn',
    'my',
    'myself',
    'name',
    'namely',
    'needn',
    'neither',
    'never',
    'nevertheless',
    'next',
    'nine',
    'no',
    'nobody',
    'none',
    'noone',
    'nor',
    'not',
    'nothing',
    'now',
    'nowhere',
    'o',
    'of',
    'off',
    'often',
    'on',
    'once',
    'one',
    'only',
    'onto',
    'or',
    'other',
    'others',
    'otherwise',
    'our',
    'ours',
    'ourselves',
    'out',
    'over',
    'own',
    'part',
    'per',
    'perhaps',
    'please',
    'put',
    'rather',
    're',
    's',
    'same',
    'see',
    'seem',
    'seemed',
    'seeming',
    'seems',
    'serious',
    'several',
    'shan',
    'she',
    'should',
    'shouldn',
    'show',
    'side',
    'since',
    'sincere',
    'six',
    'sixty',
    'so',
    'some',
    'somehow',
    'someone',
    'something',
    'sometime',
    'sometimes',
    'somewhere',
    'still',
    'such',
    'system',
    't',
    'take',
    'ten',
    'than',
    'that',
    'the',
    'their',
    'theirs',
    'them',
    'themselves',
    'then',
    'thence',
    'there',
    'thereafter',
    'thereby',
    'therefore',
    'therein',
    'thereupon',
    'these',
    'they',
    'thick',
    'thin',
    'third',
    'this',
    'those',
    'though',
    'three',
    'through',
    'throughout',
    'thru',
    'thus',
    'to',
    'together',
    'too',
    'top',
    'toward',
    'towards',
    'twelve',
    'twenty',
    'two',
    'un',
    'under',
    'until',
    'up',
    'upon',
    'us',
    've',
    'very',
    'via',
    'was',
    'wasn',
    'we',
    'well',
    'were',
    'weren',
    'what',
    'whatever',
    'when',
    'whence',
    'whenever',
    'where',
    'whereafter',
    'whereas',
    'whereby',
    'wherein',
    'whereupon',
    'wherever',
    'whether',
    'which',
    'while',
    'whither',
    'who',
    'whoever',
    'whole',
    'whom',
    'whose',
    'why',
    'will',
    'with',
    'within',
    'without',
    'won',
    'would',
    'wouldn',
    'y',
    'yet',
    'you',
    'your',
    'yours',
    'yourself',
    'yourselves',
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
thx