inky · June 8, 2014 12:19
diff --git a/README b/README
 A Python class for filtering a set of English words.

 Wildcard patterns are read from a file named ignore.txt.

 Requires the CLiPS Pattern library: http://www.clips.ua.ac.be/pages/pattern
diff --git a/ignore.txt b/ignore.txt
 # example of an ignore file
 gosh golly
 butt* ass
 belgium
diff --git a/wordfilter.py b/wordfilter.py
 import re
 import sys

 import pattern.en as en


 class WordFilter(object):
     """Filter English words against wildcard patterns from an ignore file.

     Each whitespace-separated token in the ignore file is one pattern.
     ``*`` matches any run of characters, and everything after ``#`` on a
     line is a comment.  Patterns are matched case-insensitively against
     whole words.
     """

     def __init__(self, ignore_file='ignore.txt'):
         """
         Compile the patterns found in ignore_file into self.regex.

         If the file cannot be opened, no patterns are loaded and nothing
         is ignored.
         """
         super(WordFilter, self).__init__()
         ignore_patterns = set()
         try:
             with open(ignore_file) as fp:
                 for line in fp:
                     # Drop the trailing comment, if any.
                     line = line.split('#', 1)[0].lower()
                     ignore_patterns.update(self._wildcard_to_regex(w)
                                            for w in line.split())
         except IOError:
             # Deliberate best effort: a missing or unreadable ignore file
             # just means there is nothing to ignore.
             pass

         # Sorted so the compiled pattern is deterministic.  With no patterns
         # this is '^()$', which only matches the empty string -- and
         # str.split() never yields one, so nothing gets filtered.
         # IGNORECASE because the patterns were lower-cased above but the
         # words being filtered are not.
         self.regex = re.compile('^(%s)$' % '|'.join(sorted(ignore_patterns)),
                                 re.IGNORECASE)

     @staticmethod
     def _wildcard_to_regex(wildcard):
         """
         Translate one wildcard pattern into regular-expression source.

         ``*`` becomes ``.*``; every other character is escaped so that it
         only matches itself (e.g. the '+' in 'c++' or the '.' in 'a.b').
         """
         return '.*'.join(re.escape(piece) for piece in wildcard.split('*'))

     def filter_words(self, words, tuple_index=None, silent=True, print_prefix=''):
         """
         Return the set of words that do not match any ignore pattern.

         A word is dropped when it is empty, when any of its
         whitespace-separated tokens matches an ignore pattern, or when the
         same is true of its lemma (as returned by pattern.en's lemma()).

         If tuple_index is an integer, each 'word' will be interpreted as
         a tuple, where the actual word string of interest is at tuple_index --
         e.g. tuple_index=1 with a tuple in the form (pos, word, another_string).
         The returned set then contains the whole tuples.

         Unless silent is true, every word dropped because of a pattern match
         is reported on stderr, with print_prefix prepended to the message.

         """
         re_ignore = self.regex

         def is_ignored(text):
             # A multi-word entry is rejected if any single token matches.
             return any(re_ignore.match(token) for token in text.split())

         def valid_word(item):
             if not item:
                 return False

             word = item if tuple_index is None else item[tuple_index]

             if is_ignored(word):
                 if not silent:
                     sys.stderr.write('%sIgnoring: %s\n' %
                                      (print_prefix, word))
                 return False

             # Check the lemma too, so a word is also dropped when its base
             # form is on the ignore list.
             lemma = en.lemma(word)
             if lemma != word and is_ignored(lemma):
                 if not silent:
                     sys.stderr.write('%sIgnoring: %s (%s)\n' %
                                      (print_prefix, word, lemma))
                 return False

             return True

         return set(filter(valid_word, words))
	A Python class for filtering a set of English words.

	Wildcard patterns are read from a file named ignore.txt.

	Requires the CLiPS Pattern library: http://www.clips.ua.ac.be/pages/pattern
	import re
	import sys

	import pattern.en as en


	class WordFilter(object):
	    """Filter English words against wildcard patterns from an ignore file.

	    Each whitespace-separated token in the ignore file is one pattern.
	    ``*`` matches any run of characters, and everything after ``#`` on a
	    line is a comment.  Patterns are matched case-insensitively against
	    whole words.
	    """

	    def __init__(self, ignore_file='ignore.txt'):
	        """
	        Compile the patterns found in ignore_file into self.regex.

	        If the file cannot be opened, no patterns are loaded and nothing
	        is ignored.
	        """
	        super(WordFilter, self).__init__()
	        ignore_patterns = set()
	        try:
	            with open(ignore_file) as fp:
	                for line in fp:
	                    # Drop the trailing comment, if any.
	                    line = line.split('#', 1)[0].lower()
	                    ignore_patterns.update(self._wildcard_to_regex(w)
	                                           for w in line.split())
	        except IOError:
	            # Deliberate best effort: a missing or unreadable ignore file
	            # just means there is nothing to ignore.
	            pass

	        # Sorted so the compiled pattern is deterministic.  With no patterns
	        # this is '^()$', which only matches the empty string -- and
	        # str.split() never yields one, so nothing gets filtered.
	        # IGNORECASE because the patterns were lower-cased above but the
	        # words being filtered are not.
	        self.regex = re.compile('^(%s)$' % '|'.join(sorted(ignore_patterns)),
	                                re.IGNORECASE)

	    @staticmethod
	    def _wildcard_to_regex(wildcard):
	        """
	        Translate one wildcard pattern into regular-expression source.

	        ``*`` becomes ``.*``; every other character is escaped so that it
	        only matches itself (e.g. the '+' in 'c++' or the '.' in 'a.b').
	        """
	        return '.*'.join(re.escape(piece) for piece in wildcard.split('*'))

	    def filter_words(self, words, tuple_index=None, silent=True, print_prefix=''):
	        """
	        Return the set of words that do not match any ignore pattern.

	        A word is dropped when it is empty, when any of its
	        whitespace-separated tokens matches an ignore pattern, or when the
	        same is true of its lemma (as returned by pattern.en's lemma()).

	        If tuple_index is an integer, each 'word' will be interpreted as
	        a tuple, where the actual word string of interest is at tuple_index --
	        e.g. tuple_index=1 with a tuple in the form (pos, word, another_string).
	        The returned set then contains the whole tuples.

	        Unless silent is true, every word dropped because of a pattern match
	        is reported on stderr, with print_prefix prepended to the message.

	        """
	        re_ignore = self.regex

	        def is_ignored(text):
	            # A multi-word entry is rejected if any single token matches.
	            return any(re_ignore.match(token) for token in text.split())

	        def valid_word(item):
	            if not item:
	                return False

	            word = item if tuple_index is None else item[tuple_index]

	            if is_ignored(word):
	                if not silent:
	                    sys.stderr.write('%sIgnoring: %s\n' %
	                                     (print_prefix, word))
	                return False

	            # Check the lemma too, so a word is also dropped when its base
	            # form is on the ignore list.
	            lemma = en.lemma(word)
	            if lemma != word and is_ignored(lemma):
	                if not silent:
	                    sys.stderr.write('%sIgnoring: %s (%s)\n' %
	                                     (print_prefix, word, lemma))
	                return False

	            return True

	        return set(filter(valid_word, words))