Skip to content

Instantly share code, notes, and snippets.

@inky
Created June 8, 2014 12:19
Show Gist options
  • Save inky/e143a23d54f2e8907c0f to your computer and use it in GitHub Desktop.
Save inky/e143a23d54f2e8907c0f to your computer and use it in GitHub Desktop.
WordFilter for bot corpora
A Python class for filtering a set of English words.
Wildcard patterns are read from a file named ignore.txt.
Requires the CLiPS Pattern library: http://www.clips.ua.ac.be/pages/pattern
# example of an ignore file
gosh golly
butt* ass
belgium
import re
import sys
import pattern.en as en
class WordFilter(object):
    """Filter English words against wildcard patterns from an ignore file.

    The ignore file holds whitespace-separated patterns, any number per
    line.  Everything after a ``#`` on a line is a comment.  Patterns are
    lowercased when read, and ``*`` matches any run of characters; every
    other character is matched literally.
    """

    def __init__(self, ignore_file='ignore.txt'):
        """Compile the patterns found in *ignore_file* into ``self.regex``.

        A missing or unreadable file is not an error: the filter is then
        built with no patterns, so no word is rejected by pattern.
        """
        super(WordFilter, self).__init__()
        ignore_patterns = set()
        try:
            with open(ignore_file) as fp:
                for line in fp:
                    # Strip the trailing comment before tokenizing.
                    line = line.split('#', 1)[0].lower()
                    ignore_patterns.update(
                        self._wildcard_to_regex(w) for w in line.split())
        except IOError:
            # Deliberate best effort: no ignore file means no patterns.
            pass
        # Sorting keeps the compiled expression deterministic.  With no
        # patterns this becomes '^()$', which only matches the empty
        # string -- and the tokens tested against it are never empty.
        self.regex = re.compile('^(%s)$' % '|'.join(sorted(ignore_patterns)))

    @staticmethod
    def _wildcard_to_regex(pattern):
        """Translate one wildcard pattern into a regex fragment.

        Only ``*`` is special.  The literal pieces are escaped so that
        characters such as ``.``, ``+`` or ``(`` in the ignore file are
        matched as themselves instead of being read as regex syntax.
        """
        return '.*'.join(re.escape(piece) for piece in pattern.split('*'))

    def _is_ignored(self, text):
        """Return True if any whitespace-separated token of *text* matches."""
        match = self.regex.match
        return any(match(token) for token in text.split())

    def filter_words(self, words, tuple_index=None, silent=True, print_prefix=''):
        """
        Filter a list of words against the ignore patterns loaded at
        construction time, and return the survivors as a set.

        A word is dropped when it is empty, when any of its
        whitespace-separated tokens matches an ignore pattern, or when the
        same is true of its lemma as returned by pattern.en's lemma().

        If tuple_index is an integer, each 'word' will be interpreted as
        a tuple, where the actual word string of interest is at tuple_index --
        e.g. tuple_index=1 with a tuple in the form (pos, word, another_string).
        The returned set then contains the original tuples.

        Unless silent is true, each rejected word is reported on stderr,
        with print_prefix prepended to the message.
        """
        def valid_word(word):
            if not word:
                return False
            if tuple_index is not None:
                word = word[tuple_index]
                # An empty word inside a tuple is rejected just like a
                # bare empty word, instead of being handed to the
                # lemmatizer.
                if not word:
                    return False
            if self._is_ignored(word):
                if not silent:
                    sys.stderr.write('%sIgnoring: %s\n' %
                                     (print_prefix, word))
                return False
            # Also reject inflected forms whose base form is ignored.
            lemma = en.lemma(word)
            if lemma != word and self._is_ignored(lemma):
                if not silent:
                    sys.stderr.write('%sIgnoring: %s (%s)\n' %
                                     (print_prefix, word, lemma))
                return False
            return True

        return set(filter(valid_word, words))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment