Created
June 8, 2014 12:19
-
-
Save inky/e143a23d54f2e8907c0f to your computer and use it in GitHub Desktop.
WordFilter for bot corpora
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
A Python class for filtering a set of English words.
Wildcard patterns are read from a file (named ignore.txt by default).
Requires the CLiPS Pattern library: http://www.clips.ua.ac.be/pages/pattern
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # example of an ignore file | |
| gosh golly | |
| butt* ass | |
| belgium |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| import sys | |
| import pattern.en as en | |
class WordFilter(object):
    """Filter English words against a set of wildcard ignore patterns.

    Patterns are loaded once, at construction time, from a text file:

    * patterns are separated by whitespace, several may share a line;
    * ``#`` starts a comment that runs to the end of the line;
    * patterns are lowercased when read;
    * ``*`` matches any run of characters (including none); every other
      character is matched literally.

    All patterns are combined into a single anchored regular expression,
    stored in ``self.regex``.
    """

    def __init__(self, ignore_file='ignore.txt'):
        """Read ``ignore_file`` and compile its patterns into ``self.regex``.

        If the file cannot be opened or read, the error is swallowed and the
        filter is built from whatever patterns were collected so far
        (none, when the file is missing).
        """
        super(WordFilter, self).__init__()
        ignore_patterns = set()
        try:
            with open(ignore_file) as fp:
                for line in fp:
                    # Drop the trailing comment, then normalise case.
                    line = line.split('#', 1)[0].lower()
                    ignore_patterns.update(
                        self._wildcard_to_regex(w) for w in line.split())
        except IOError:
            # Deliberately best-effort: a missing ignore file simply means
            # there is nothing to ignore.
            pass
        # Sorting keeps the compiled pattern deterministic across runs.
        # With no patterns this is '^()$', which can only match the empty
        # string -- and whitespace-split tokens are never empty.
        self.regex = re.compile('^(%s)$' % '|'.join(sorted(ignore_patterns)))

    @staticmethod
    def _wildcard_to_regex(wildcard):
        """Translate one wildcard pattern into regular-expression source.

        Only ``*`` is special; the literal pieces between wildcards are
        escaped so that characters such as ``.`` or ``+`` in the ignore file
        neither change the meaning of the pattern nor break compilation.
        """
        return '.*'.join(re.escape(piece) for piece in wildcard.split('*'))

    def _is_ignored(self, text):
        """Return True if any whitespace-separated token of ``text`` matches."""
        return any(self.regex.match(token) for token in text.split())

    def filter_words(self, words, tuple_index=None, silent=True, print_prefix=''):
        """
        Filter a list of words against the ignore patterns loaded by __init__.

        A word is dropped when it is empty, when any of its
        whitespace-separated tokens matches an ignore pattern, or when the
        same is true of its lemma (as returned by ``pattern.en.lemma``).

        If tuple_index is an integer, each 'word' will be interpreted as
        a tuple, where the actual word string of interest is at tuple_index --
        e.g. tuple_index=1 with a tuple in the form (pos, word, another_string).

        If silent is false, a line is written to stderr for every word that
        is dropped because of an ignore pattern, prefixed with print_prefix.

        Returns a set of the surviving items (whole tuples when tuple_index
        is given).
        """
        def valid_word(word):
            if not word:
                return False
            if tuple_index is not None:
                word = word[tuple_index]
                # An empty string inside a tuple is as useless as an empty
                # top-level word; reject it before lemmatizing.
                if not word:
                    return False
            if self._is_ignored(word):
                if not silent:
                    sys.stderr.write('%sIgnoring: %s\n' %
                                     (print_prefix, word))
                return False
            # Check the base form too, so that inflected variants of an
            # ignored word are caught as well.
            lemma = en.lemma(word)
            if lemma != word and self._is_ignored(lemma):
                if not silent:
                    sys.stderr.write('%sIgnoring: %s (%s)\n' %
                                     (print_prefix, word, lemma))
                return False
            return True

        return set(filter(valid_word, words))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment