goakley · November 11, 2013 01:17
diff --git a/trigram.py b/trigram.py
 #!/usr/bin/env python

 """
 Trigram tool that takes text input and creates a random output based
 on the N-grams of the input.
 """

 import argparse
 from sys import stdin
 from random import choice


 class Trigram(object):
    """Markov-style N-gram model built over words or characters.

    The model maps every run of ``count - 1`` consecutive tokens found in
    the input to the list of tokens that followed that run.  Duplicates
    are kept in the list so that ``random.choice`` picks followers in
    proportion to how often they occurred.

    Attributes:
        count: width of the N-gram (prefix length plus one follower).
        characters: True when tokens are single characters, False when
            they are whitespace-separated words.
        start: the first ``count - 1`` tokens of the input; used as the
            seed of every generated text.
    """
    def __init__(self, text, count=3, characters=False):
        """
        Builds the lookup table from ``text``.

        text -- the input to learn from
        count -- number of tokens per N-gram (must be at least 1)
        characters -- split on characters instead of whitespace

        Raises ValueError when ``count`` is smaller than 1.
        """
        if count < 1:
            raise ValueError("count must be at least 1")
        self.count = count
        self.characters = characters
        tokens = text if characters else text.split()
        self.start = tokens[:count-1]
        self._reference = {}
        # "+ 1" so that the final N-gram of the input is recorded too;
        # range() is simply empty when there are fewer tokens than count.
        for index in range(len(tokens) - count + 1):
            lookup = tokens[index:index+(count-1)]
            if not characters:
                # Lists are unhashable; word prefixes are stored as tuples.
                lookup = tuple(lookup)
            follower = tokens[index+(count-1)]
            self._reference.setdefault(lookup, []).append(follower)

    def create(self, matches=131072):
        """
        Creates a random string based off of the tokens stored in this Trigram

        matches -- maximum number of tokens to generate after the seed;
                   0 (or a negative value) disables the limit

        Generation stops early as soon as the current prefix has no
        recorded follower.
        """
        words = list(self.start)
        key = self.start if self.characters else tuple(self.start)
        appended = 0
        while key in self._reference:
            appendix = choice(self._reference[key])
            words.append(appendix)
            appended += 1
            if matches > 0 and appended >= matches:
                break
            # Slide the window: add the new token, drop the oldest one.
            # Slicing (instead of pop(0)) also works for an empty prefix,
            # which is what count == 1 produces.
            tail = appendix if self.characters else (appendix,)
            key = (key + tail)[1:]
        return ('' if self.characters else ' ').join(words)

    def __iadd__(self, other):
        """
        Merges the lookup table of ``other`` into this Trigram.

        The seed (``start``) of this Trigram is left untouched.

        Raises TypeError when ``other`` is not a Trigram, when one side is
        character based and the other word based, or when the widths
        differ.
        """
        if not isinstance(other, Trigram):
            raise TypeError("Cannot append a non-Trigram to a Trigram")
        if self.characters != other.characters:
            raise TypeError("Cannot combine a character and word Trigram")
        if self.count != other.count:
            # Prefixes of different lengths would never match each other.
            raise TypeError("Cannot combine Trigrams of different widths")
        for ref, followers in other._reference.items():
            # extend() copies the items, so the two models never end up
            # sharing (and later mutating) the same list object.
            self._reference.setdefault(ref, []).extend(followers)
        return self


 if __name__ == "__main__":
    # Command-line entry point: parse the options, build one model from
    # every named file (or from standard input when no file is given) and
    # print a single generated text.
    PARSER = argparse.ArgumentParser(description="\
                                     Use trigrams to generate new text\
                                     from input text.")
    PARSER.add_argument('-c', '--characters',
                        dest='characters', action='store_true',
                        help="splits on characters instead of words")
    PARSER.add_argument('-w', metavar='WIDTH', dest='width',
                        type=int, default=3,
                        help="number of tokens to split on (default 3)")
    PARSER.add_argument('-s', metavar='STOP', dest='stop',
                        type=int, default=131072,
                        help="hard stop after s-matches (default 131072,\
                        0 for no artificial limit)")
    PARSER.add_argument('file', nargs='*',
                        help="file to use in processing")
    ARGS = PARSER.parse_args()

    # Every model is built with the same width and tokenisation mode.
    OPTIONS = dict(count=ARGS.width, characters=ARGS.characters)

    TRIGRAM = None
    if not ARGS.file:
        # No file arguments: learn from whatever arrives on stdin.
        TRIGRAM = Trigram(stdin.read(), **OPTIONS)
    for filename in ARGS.file:
        with open(filename) as handle:
            piece = Trigram(handle.read(), **OPTIONS)
        # The first file seeds the model; later files are merged into it.
        if TRIGRAM is None:
            TRIGRAM = piece
        else:
            TRIGRAM += piece
    print(TRIGRAM.create(ARGS.stop))
	#!/usr/bin/env python

	"""
	Trigram tool that takes text input and creates a random output based
	on the N-grams of the input.
	"""

	import argparse
	from sys import stdin
	from random import choice


	class Trigram(object):
	    """Markov-style N-gram model built over words or characters.

	    The model maps every run of ``count - 1`` consecutive tokens found in
	    the input to the list of tokens that followed that run.  Duplicates
	    are kept in the list so that ``random.choice`` picks followers in
	    proportion to how often they occurred.

	    Attributes:
	        count: width of the N-gram (prefix length plus one follower).
	        characters: True when tokens are single characters, False when
	            they are whitespace-separated words.
	        start: the first ``count - 1`` tokens of the input; used as the
	            seed of every generated text.
	    """
	    def __init__(self, text, count=3, characters=False):
	        """
	        Builds the lookup table from ``text``.

	        text -- the input to learn from
	        count -- number of tokens per N-gram (must be at least 1)
	        characters -- split on characters instead of whitespace

	        Raises ValueError when ``count`` is smaller than 1.
	        """
	        if count < 1:
	            raise ValueError("count must be at least 1")
	        self.count = count
	        self.characters = characters
	        tokens = text if characters else text.split()
	        self.start = tokens[:count-1]
	        self._reference = {}
	        # "+ 1" so that the final N-gram of the input is recorded too;
	        # range() is simply empty when there are fewer tokens than count.
	        for index in range(len(tokens) - count + 1):
	            lookup = tokens[index:index+(count-1)]
	            if not characters:
	                # Lists are unhashable; word prefixes are stored as tuples.
	                lookup = tuple(lookup)
	            follower = tokens[index+(count-1)]
	            self._reference.setdefault(lookup, []).append(follower)

	    def create(self, matches=131072):
	        """
	        Creates a random string based off of the tokens stored in this Trigram

	        matches -- maximum number of tokens to generate after the seed;
	                   0 (or a negative value) disables the limit

	        Generation stops early as soon as the current prefix has no
	        recorded follower.
	        """
	        words = list(self.start)
	        key = self.start if self.characters else tuple(self.start)
	        appended = 0
	        while key in self._reference:
	            appendix = choice(self._reference[key])
	            words.append(appendix)
	            appended += 1
	            if matches > 0 and appended >= matches:
	                break
	            # Slide the window: add the new token, drop the oldest one.
	            # Slicing (instead of pop(0)) also works for an empty prefix,
	            # which is what count == 1 produces.
	            tail = appendix if self.characters else (appendix,)
	            key = (key + tail)[1:]
	        return ('' if self.characters else ' ').join(words)

	    def __iadd__(self, other):
	        """
	        Merges the lookup table of ``other`` into this Trigram.

	        The seed (``start``) of this Trigram is left untouched.

	        Raises TypeError when ``other`` is not a Trigram, when one side is
	        character based and the other word based, or when the widths
	        differ.
	        """
	        if not isinstance(other, Trigram):
	            raise TypeError("Cannot append a non-Trigram to a Trigram")
	        if self.characters != other.characters:
	            raise TypeError("Cannot combine a character and word Trigram")
	        if self.count != other.count:
	            # Prefixes of different lengths would never match each other.
	            raise TypeError("Cannot combine Trigrams of different widths")
	        for ref, followers in other._reference.items():
	            # extend() copies the items, so the two models never end up
	            # sharing (and later mutating) the same list object.
	            self._reference.setdefault(ref, []).extend(followers)
	        return self


	if __name__ == "__main__":
	    # Command-line entry point: parse the options, build one model from
	    # every named file (or from standard input when no file is given) and
	    # print a single generated text.
	    PARSER = argparse.ArgumentParser(description="\
	Use trigrams to generate new text\
	from input text.")
	    PARSER.add_argument('-c', '--characters',
	                        dest='characters', action='store_true',
	                        help="splits on characters instead of words")
	    PARSER.add_argument('-w', metavar='WIDTH', dest='width',
	                        type=int, default=3,
	                        help="number of tokens to split on (default 3)")
	    PARSER.add_argument('-s', metavar='STOP', dest='stop',
	                        type=int, default=131072,
	                        help="hard stop after s-matches (default 131072,\
	0 for no artificial limit)")
	    PARSER.add_argument('file', nargs='*',
	                        help="file to use in processing")
	    ARGS = PARSER.parse_args()

	    # Every model is built with the same width and tokenisation mode.
	    OPTIONS = dict(count=ARGS.width, characters=ARGS.characters)

	    TRIGRAM = None
	    if not ARGS.file:
	        # No file arguments: learn from whatever arrives on stdin.
	        TRIGRAM = Trigram(stdin.read(), **OPTIONS)
	    for filename in ARGS.file:
	        with open(filename) as handle:
	            piece = Trigram(handle.read(), **OPTIONS)
	        # The first file seeds the model; later files are merged into it.
	        if TRIGRAM is None:
	            TRIGRAM = piece
	        else:
	            TRIGRAM += piece
	    print(TRIGRAM.create(ARGS.stop))