Skip to content

Instantly share code, notes, and snippets.

@goakley
Created November 11, 2013 01:17
Show Gist options
  • Save goakley/7406243 to your computer and use it in GitHub Desktop.
Save goakley/7406243 to your computer and use it in GitHub Desktop.
Trigram tool that takes text input and creates a random output based on the N-grams of the input.
#!/usr/bin/env python
"""
Trigram tool that takes text input and creates a random output based
on the N-grams of the input.
"""
import argparse
from sys import stdin
from random import choice
class Trigram(object):
    """Represents a text N-gram model of either words or characters.

    The model maps every run of ``count - 1`` consecutive tokens found in
    the input to the list of tokens that followed that run.  Duplicate
    followers are kept, so ``random.choice`` picks them proportionally to
    how often they occurred.

    Attributes:
        count: Width of the N-gram (prefix length plus one).
        characters: True when tokens are single characters, False when
            tokens are whitespace-separated words.
        start: The first ``count - 1`` tokens of the input; used as the
            seed for generation.
    """

    def __init__(self, text, count=3, characters=False):
        """Build the lookup table from *text*.

        Args:
            text: Input string to learn from.
            count: N-gram width; must be at least 1.
            characters: Tokenize per character instead of per word.

        Raises:
            ValueError: If *count* is smaller than 1.
        """
        if count < 1:
            raise ValueError("count must be at least 1")
        self.count = count
        self.characters = characters
        tokens = text if characters else text.split()
        prefix = count - 1
        self.start = tokens[:prefix]
        self._reference = {}
        # "+ 1" so the final complete N-gram is recorded as well; without
        # it the last token of the input could never be generated.
        for index in range(len(tokens) - count + 1):
            lookup = tokens[index:index + prefix]
            if not characters:
                # Lists are unhashable; word prefixes are stored as tuples.
                lookup = tuple(lookup)
            follower = tokens[index + prefix]
            self._reference.setdefault(lookup, []).append(follower)

    def create(self, matches=131072):
        """Create a random string based on the tokens stored in this model.

        Generation starts from ``self.start`` and repeatedly appends a
        randomly chosen follower of the current prefix until the prefix
        has no recorded follower or the limit is reached.

        Args:
            matches: Maximum number of tokens to append after the seed.
                A value of 0 (or less) disables the limit, in which case
                generation only stops at a prefix with no follower.

        Returns:
            The generated tokens joined with '' (character mode) or ' '
            (word mode).
        """
        words = list(self.start)
        seed_length = len(words)
        key = self.start if self.characters else tuple(self.start)
        while key in self._reference:
            appendix = choice(self._reference[key])
            words.append(appendix)
            # Slide the window: append first, then drop the oldest token.
            # This order also works for count == 1, where the key is empty.
            if self.characters:
                key = (key + appendix)[1:]
            else:
                key = (key + (appendix,))[1:]
            if matches > 0 and len(words) - seed_length >= matches:
                break
        return ('' if self.characters else ' ').join(words)

    def __iadd__(self, other):
        """Merge the lookup table of *other* into this model in place.

        The seed (``self.start``) is left unchanged.  The ``count`` of the
        two models is not compared.

        Raises:
            TypeError: If *other* is not a Trigram, or if one model is
                character-based and the other word-based.
        """
        if not isinstance(other, Trigram):
            raise TypeError("Cannot append a non-Trigram to a Trigram")
        if self.characters != other.characters:
            raise TypeError("Cannot combine a character and word Trigram")
        for ref, followers in other._reference.items():
            # extend() into our own list so the two models never share a
            # list object (later merges must not mutate ``other``).
            self._reference.setdefault(ref, []).extend(followers)
        return self
if __name__ == "__main__":
    # Command-line entry point: build one model from every input file (or
    # from stdin when no file is given) and print one generated text.
    #
    # Help strings use adjacent-literal concatenation instead of a
    # backslash-newline inside the literal, so the text no longer depends
    # on how the source lines happen to be indented.
    PARSER = argparse.ArgumentParser(
        description="Use trigrams to generate new text "
                    "from input text.")
    PARSER.add_argument('-c', '--characters',
                        action='store_true', dest='characters',
                        help="splits on characters instead of words")
    PARSER.add_argument('-w', type=int, default=3,
                        metavar='WIDTH', dest='width',
                        help="number of tokens to split on (default 3)")
    PARSER.add_argument('-s', type=int, default=131072,
                        metavar='STOP', dest='stop',
                        help="hard stop after s-matches (default 131072, "
                             "0 for no artificial limit)")
    PARSER.add_argument('file', nargs='*',
                        help="file to use in processing")
    ARGS = PARSER.parse_args()
    TRIGRAM = None
    if ARGS.file:
        for filename in ARGS.file:
            with open(filename) as f:
                inputtext = f.read()
            if TRIGRAM is None:
                # The first file seeds the model (and its start tokens).
                TRIGRAM = Trigram(inputtext, count=ARGS.width,
                                  characters=ARGS.characters)
            else:
                # Later files only contribute their lookup tables.
                TRIGRAM += Trigram(inputtext, count=ARGS.width,
                                   characters=ARGS.characters)
    else:
        TRIGRAM = Trigram(stdin.read(), count=ARGS.width,
                          characters=ARGS.characters)
    print(TRIGRAM.create(ARGS.stop))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment