Created
June 30, 2016 10:45
-
-
Save manuchandel/20d0b1430e00455bc99302cf4d2a3819 to your computer and use it in GitHub Desktop.
Contains English stop words, question words, and first-, second-, and third-person words; can be used for NLP.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Set of stop words in the English language (all lowercase), grouped by
# initial letter, one row per letter. Being a set, membership tests are O(1)
# on average and entry order carries no meaning.
# NOTE(review): some common function words (e.g. 'to', 'now') are not in this
# list -- confirm whether that omission is intentional before relying on it.
stop_words = {
    'a', 'about', 'above', 'across', 'after', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'among', 'an', 'and', 'another', 'any', 'anybody', 'anyone', 'anything', 'anywhere', 'are', 'area', 'areas', 'around', 'as', 'ask', 'asked', 'asking', 'asks', 'at', 'away',
    'back', 'backed', 'backing', 'backs', 'be', 'became', 'because', 'become', 'becomes', 'been', 'before', 'began', 'behind', 'being', 'beings', 'best', 'better', 'between', 'big', 'both', 'but', 'by',
    'came', 'can', 'cannot', 'case', 'cases', 'certain', 'certainly', 'clear', 'clearly', 'come', 'could',
    'did', 'differ', 'different', 'differently', 'do', 'does', 'done', 'down', 'downed', 'downing', 'downs', 'during',
    'each', 'early', 'either', 'end', 'ended', 'ending', 'ends', 'enough', 'even', 'evenly', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere',
    'face', 'faces', 'fact', 'facts', 'far', 'felt', 'few', 'find', 'finds', 'first', 'follow', 'for', 'four', 'from', 'full', 'fully', 'further', 'furthered', 'furthering', 'furthers',
    'gave', 'general', 'generally', 'get', 'gets', 'give', 'given', 'gives', 'glad', 'go', 'going', 'good', 'goods', 'got', 'great', 'greater', 'greatest', 'group', 'grouped', 'grouping', 'groups',
    'had', 'has', 'have', 'having', 'he', 'her', 'here', 'herself', 'hi', 'high', 'higher', 'highest', 'him', 'himself', 'his', 'hmm', 'hooray', 'how', 'however', 'huh',
    'i', 'if', 'important', 'in', 'interest', 'interested', 'interesting', 'interests', 'into', 'is', 'it', 'its', 'itself',
    'just',
    'keep', 'keeps', 'kind', 'knew', 'know', 'known', 'knows',
    'large', 'largely', 'last', 'later', 'latest', 'least', 'less', 'let', 'lets', 'like', 'likely', 'long', 'longer', 'longest',
    'made', 'make', 'making', 'man', 'many', 'may', 'me', 'member', 'members', 'men', 'might', 'more', 'most', 'mostly', 'mr', 'mrs', 'much', 'must', 'my', 'myself',
    'necessary', 'need', 'needed', 'needing', 'needs', 'never', 'new', 'newer', 'newest', 'next', 'no', 'nobody', 'non', 'noone', 'not', 'nothing', 'nowhere', 'number', 'numbers',
    'of', 'off', 'often', 'oh', 'old', 'older', 'oldest', 'on', 'once', 'one', 'only', 'open', 'opened', 'opening', 'opens', 'or', 'order', 'ordered', 'ordering', 'orders', 'other', 'others', 'our', 'out', 'over',
    'part', 'parted', 'parting', 'parts', 'per', 'perhaps', 'place', 'places', 'point', 'pointed', 'pointing', 'points', 'possible', 'present', 'presented', 'presenting', 'presents', 'problem', 'problems', 'put', 'puts',
    'quite',
    'rather', 'really', 'right', 'room', 'rooms',
    'said', 'same', 'saw', 'say', 'says', 'second', 'seconds', 'see', 'seem', 'seemed', 'seeming', 'seems', 'sees', 'several', 'shall', 'she', 'should', 'show', 'showed', 'showing', 'shows', 'side', 'sides', 'since', 'small', 'smaller', 'smallest', 'so', 'some', 'somebody', 'someone', 'something', 'somewhere', 'state', 'states', 'still', 'such', 'sure',
    'take', 'taken', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'therefore', 'these', 'they', 'thing', 'things', 'think', 'thinks', 'this', 'those', 'though', 'thought', 'thoughts', 'three', 'through', 'thus', 'today', 'together', 'too', 'took', 'toward', 'turn', 'turned', 'turning', 'turns', 'two',
    'uh', 'under', 'until', 'up', 'upon', 'us', 'use', 'used', 'uses',
    'very',
    'want', 'wanted', 'wanting', 'wants', 'was', 'way', 'ways', 'we', 'well', 'wells', 'went', 'were', 'what', 'when', 'where', 'whether', 'which', 'while', 'who', 'whole', 'whose', 'why', 'will', 'with', 'within', 'without', 'work', 'worked', 'working', 'works', 'would',
    'year', 'years', 'yet', 'you', 'young', 'younger', 'youngest', 'your', 'yours',
}
# Set of first-person words in the English language (lowercase). Besides the
# pronouns it also holds 'am' and the bare 'm'.
first_person_words = {
    'am', 'i', 'm', 'me', 'mine', 'my', 'myself',
    'our', 'ours', 'ourself', 'ourselves', 'we',
}
# Set of second-person words in the English language (lowercase).
second_person_words = {'you', 'your', 'yours', 'yourself', 'yourselves'}
# Set of third-person words in the English language (lowercase).
third_person_words = {
    'he', 'her', 'hers', 'herself', 'him', 'himself', 'his',
    'it', 'its', 'itself',
    'she',
    'their', 'theirs', 'themself', 'themselves', 'they',
}
# Set of question (interrogative) words in the English language (lowercase).
question_words = {'how', 'what', 'when', 'where', 'which', 'who', 'whom', 'why'}
# Regular expression used to tokenize a sentence into words: it matches a run
# of one or more delimiter characters, so it can be passed to re.split().
# The whole pattern is a single character class, so the '|' characters between
# the entries are literal rather than alternation; the pipe is therefore a
# delimiter too. The delimiters are:
#   . , ( ) / | ' " newline space ! ? : ; - = @ #
words_regex = '[\\.|,|\\(|\\)|\\/|\\|\'|"|\n| |!|\\?|:|;|\\-|=|@|#]+'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment