Created
November 11, 2013 01:17
-
-
Save goakley/7406243 to your computer and use it in GitHub Desktop.
Trigram tool that takes text input and creates a random output based on the N-grams of the input.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Trigram tool that takes text input and creates a random output based
on the N-grams of the input.
""" | |
import argparse | |
from sys import stdin | |
from random import choice | |
class Trigram(object):
    """Represents a text N-gram model of either words or characters.

    Every run of ``count - 1`` consecutive tokens found in the input is
    mapped to the list of tokens that were seen directly after that run.
    Duplicates are kept in the list, so a follower that occurs more often
    in the input is proportionally more likely to be picked by ``create``.

    Attributes:
        count: width of the N-gram window (3 for trigrams).
        characters: True when tokens are single characters, False when
            tokens are whitespace-separated words.
        start: the first ``count - 1`` tokens of the input; generated text
            always begins with them.
    """

    def __init__(self, text, count=3, characters=False):
        """Build the follower table from ``text``.

        Args:
            text: input string to learn from.
            count: N-gram width; must be at least 1.
            characters: split on characters instead of on whitespace.

        Raises:
            ValueError: if ``count`` is smaller than 1.
        """
        if count < 1:
            raise ValueError("count must be at least 1")
        self.count = count
        self.characters = characters
        tokens = text if characters else text.split()
        prefix = count - 1
        self.start = tokens[:prefix]
        self._reference = {}
        # "+ 1" so that the window ending on the very last token is
        # recorded too; without it the final N-gram is silently dropped.
        for index in range(len(tokens) - count + 1):
            lookup = tokens[index:index + prefix]
            if not characters:
                # Lists are unhashable; word keys are stored as tuples.
                lookup = tuple(lookup)
            self._reference.setdefault(lookup, []).append(
                tokens[index + prefix])

    def create(self, matches=131072):
        """
        Creates a random string based off of the tokens stored in this Trigram

        Args:
            matches: maximum number of tokens to generate after the
                starting tokens. A value of 0 (or less) disables the
                limit; generation then only stops when the current run
                of tokens has no recorded follower.

        Returns:
            The generated tokens joined with '' in character mode and
            with ' ' in word mode.
        """
        limit = matches if matches > 0 else None
        words = list(self.start)
        key = self.start if self.characters else tuple(self.start)
        appended = 0
        while key in self._reference:
            appendix = choice(self._reference[key])
            words.append(appendix)
            appended += 1
            if limit is not None and appended >= limit:
                break
            # Slide the window: add the new token, drop the oldest one.
            # Appending before slicing keeps the key at count - 1 tokens
            # even when that length is zero (count == 1).
            if self.characters:
                key = (key + appendix)[1:]
            else:
                key = (key + (appendix,))[1:]
        return ('' if self.characters else ' ').join(words)

    def __iadd__(self, other):
        """Merge the follower table of ``other`` into this Trigram.

        The starting tokens of this Trigram are kept; ``other`` is left
        unmodified and shares no lists with this object afterwards.

        Raises:
            TypeError: if ``other`` is not a Trigram, or if one operand
                is character-based and the other word-based.
        """
        if not isinstance(other, Trigram):
            raise TypeError("Cannot append a non-Trigram to a Trigram")
        if self.characters != other.characters:
            raise TypeError("Cannot combine a character and word Trigram")
        for ref, followers in other._reference.items():
            # extend() into our own list copies the followers instead of
            # aliasing other's list.
            self._reference.setdefault(ref, []).extend(followers)
        return self
if __name__ == "__main__":
    # Command-line entry point: learn N-grams from the given files (or
    # from standard input when no file is given) and print one randomly
    # generated text.
    PARSER = argparse.ArgumentParser(
        description="Use trigrams to generate new text from input text.")
    PARSER.add_argument('-c', '--characters',
                        action='store_true', dest='characters',
                        help="splits on characters instead of words")
    PARSER.add_argument('-w', type=int, default=3,
                        metavar='WIDTH', dest='width',
                        help="number of tokens to split on (default 3)")
    # Implicit literal concatenation keeps source indentation out of the
    # help text, unlike a backslash continuation inside the string.
    PARSER.add_argument('-s', type=int, default=131072,
                        metavar='STOP', dest='stop',
                        help="hard stop after s-matches (default 131072, "
                             "0 for no artificial limit)")
    PARSER.add_argument('file', nargs='*',
                        help="file to use in processing")
    ARGS = PARSER.parse_args()
    TRIGRAM = None
    for filename in ARGS.file:
        with open(filename) as f:
            inputtext = f.read()
        piece = Trigram(inputtext, count=ARGS.width,
                        characters=ARGS.characters)
        if TRIGRAM is None:
            # The first file provides the starting tokens of the output.
            TRIGRAM = piece
        else:
            TRIGRAM += piece
    if TRIGRAM is None:
        # No file arguments were given: read the whole of standard input.
        TRIGRAM = Trigram(stdin.read(), count=ARGS.width,
                          characters=ARGS.characters)
    print(TRIGRAM.create(ARGS.stop))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment