Created
March 15, 2018 02:07
-
-
Save elfsternberg/74dbb2204880d9d6aba3a9da1d1cc0be to your computer and use it in GitHub Desktop.
A simple Python library using Markov chains to generate new names.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Takes a filename as its required argument. The file should contain | |
names from a sample list, one name per line. It creates a Markov | |
chain of one, two, and three-letter character sequences, including | |
octothorpe-anchored sequences for the starts of names. It then | |
generates 'n' random names, each using the probabilistic distribution of
three-letter character sequences, to make names that match the | |
patterns fed in the original. | |
For example, given the 200 most popular girls' names in the USA for | |
the last decade, this script gives the following new and unique names: | |
Adall, Adriah, Alanie, Alliana, Camilian, Charlet, Delila, Destina, | |
Ellian, Emila, Faitlyn, Genelody, Isabeth, Izabelle, Kaith, Kaylor, | |
Kendalyn, Kendalynn, Kennifer, Lailey, Lillison, Madie, Melana, Paisy, | |
Sadison, Scarlotte, Stephine, Valentiny, Vanesis, Viole | |
This is a modified version of the code presented in | |
https://github.com/Tw1ddle/MarkovNameGenerator.git, made more portable | |
(who programs in Haxe voluntarily?) and just a bit streamlined. I am | |
impressed with Sam Twidale's getting around the whole tries thing with | |
a simple map. | |
The original was proposed for generating names for video games, but | |
as a writer I've been using the namelur algorithm forever for generating | |
lists of character names, and this algorithm is much better. | |
""" | |
from typing import Dict, DefaultDict, List, Optional | |
from collections import defaultdict | |
import random | |
class MarkovModel(object):
    """Fixed-order character model mapping each context to letter weights.

    After training, ``chains[context][i]`` holds the weight of
    ``alphabet[i]`` following ``context``: the ``prior`` plus the number of
    times that letter followed the context in the training words.  Words are
    padded with ``order`` leading '#' characters and one trailing '#'.
    """

    def __init__(self, data: List[str], order: int, prior: float, alphabet: List[str]) -> None:
        """Store the model parameters and train immediately on ``data``."""
        self.order = order
        self.prior = prior
        self.alphabet = alphabet
        self.chains = defaultdict(list)  # type: DefaultDict[str, List[float]]
        self.train(data)

    @staticmethod
    def _countMatches(arr: List[str], v: str) -> float:
        """Return how many entries of ``arr`` are equal to ``v``."""
        return sum(1.0 for entry in arr if entry == v)

    def train(self, data: List[str]):
        """Rebuild ``chains`` from scratch using the words in ``data``."""
        self.chains = defaultdict(list)  # type: DefaultDict[str, List[float]]

        # First pass: for every context, collect each letter seen after it.
        followers = defaultdict(list)  # type: DefaultDict[str, List[str]]
        for word in data:
            padded = '#' * self.order + word + '#'
            for start in range(len(padded) - self.order):
                stop = start + self.order
                followers[padded[start:stop]].append(padded[stop])

        # Second pass: turn the raw observations into one weight per
        # alphabet entry, in alphabet order.
        for context, seen in followers.items():
            self.chains[context].extend(
                self.prior + self._countMatches(seen, letter)
                for letter in self.alphabet
            )

    @staticmethod
    def _selectIndex(chain: List[float]) -> int:
        """Pick an index at random, weighted by the values in ``chain``.

        Returns 0 when no cumulative weight exceeds the random draw (for
        example when ``chain`` is empty or sums to zero).
        """
        running = 0.0
        cumulative = []  # type: List[float]
        for weight in chain:
            running += weight
            cumulative.append(running)

        target = random.random() * running
        return next(
            (index for index, bound in enumerate(cumulative) if target < bound),
            0,
        )

    def generate(self, context: str) -> Optional[str]:
        """Return a random next letter for ``context``, or None if unseen."""
        if context in self.chains:
            return self.alphabet[self._selectIndex(self.chains[context])]
        return None
class MarkovGenerator(object):
    """Generate '#'-padded words from a stack of back-off Markov models.

    ``models`` holds one MarkovModel per order, from ``order`` down to 1.
    When the highest-order model has never seen the current context, the
    next model is consulted with a context one character shorter.
    """

    def __init__(self, data: List[str], order: int, prior: float) -> None:
        """Build the alphabet from ``data`` and train one model per order."""
        self.order = order
        self.prior = prior
        # '#' is the start/end marker and always sits at index 0.  It is
        # removed from the observed letters so it is never listed twice.
        letters = {letter for word in data for letter in word}
        domain = ["#"] + sorted(letters - {"#"})
        self.models = [MarkovModel(data, self.order - i, self.prior, domain)
                       for i in range(0, self.order)]

    def generate(self) -> str:
        """Return one generated word, including its leading '#' padding.

        Generation stops at the end marker '#'.  It also stops if no model
        can extend the word (for example when the training data was empty);
        the context would never change, so retrying could not succeed.
        """
        word = '#' * self.order
        letter = self.getLetter(word)
        while letter is not None and letter != '#':
            word += letter
            letter = self.getLetter(word)
        return word

    def getLetter(self, context: str) -> Optional[str]:
        """Return the next letter for ``context``, backing off as needed.

        Each model is queried with the trailing characters of ``context``
        that match its order: ``order`` characters for the first model, one
        fewer for each following model.  Returns None when no model knows
        the context.
        """
        for depth, model in enumerate(self.models):
            size = self.order - depth
            # Clamp at 0 so a context shorter than ``size`` is used whole
            # instead of producing a negative slice start.
            suffix = context[max(len(context) - size, 0):]
            letter = model.generate(suffix)
            if letter is not None:
                return letter
        return None
class MarkovNameGenerator(object):
    """Produce names from a MarkovGenerator, filtered by simple constraints."""

    def __init__(self, data: List[str], order: int, prior: float) -> None:
        """Train the underlying MarkovGenerator on ``data``."""
        self.generator = MarkovGenerator(data, order, prior)

    def generateName(self,
                     minLength: int = 5,
                     maxLength: int = 10,
                     startsWith: str = "",
                     endsWith: str = "",
                     includes: str = "",
                     excludes: str = "") -> Optional[str]:
        """Generate a single candidate and return it only if it is accepted.

        A candidate is accepted when its length lies in
        ``[minLength, maxLength]``, it contains ``includes``, it does not
        contain ``excludes`` (an empty ``excludes`` disables that check),
        and it starts with ``startsWith`` and ends with ``endsWith``.
        Returns None for a rejected candidate.
        """
        # The generator pads words with '#' markers; strip them first.
        name = self.generator.generate().replace('#', '')
        accepted = (minLength <= len(name) <= maxLength
                    and includes in name
                    and (excludes == "" or excludes not in name)
                    and name.startswith(startsWith)
                    and name.endswith(endsWith))
        return name if accepted else None

    def generateNames(self,
                      count: int,
                      minLength: int = 5,
                      maxLength: int = 10,
                      startsWith: str = "",
                      endsWith: str = "",
                      includes: str = "",
                      excludes: str = "",
                      maxAttempts: Optional[int] = None) -> List[str]:
        """Collect ``count`` accepted names (duplicates are possible).

        The filter arguments are passed through to ``generateName``.
        Empty names are never collected.

        ``maxAttempts`` optionally caps the number of candidates tried.
        When the cap is reached, the names found so far are returned, which
        may be fewer than ``count``.  With the default of None the loop
        runs until ``count`` names are found, and never returns if the
        filters cannot be satisfied.
        """
        names = []  # type: List[str]
        attempts = 0
        while len(names) < count:
            if maxAttempts is not None and attempts >= maxAttempts:
                break
            attempts += 1
            name = self.generateName(minLength, maxLength, startsWith,
                                     endsWith, includes, excludes)
            if name:
                names.append(name)
        return names
if __name__ == '__main__':
    # Command-line entry point: NAMEFILE [COUNT]
    #   NAMEFILE  text file with one sample name per line
    #   COUNT     number of names to print (default 10)
    import sys

    # The sample file is required; print a usage line instead of failing
    # with an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: {} NAMEFILE [COUNT]".format(sys.argv[0]))

    datafile = sys.argv[1]
    count = 10
    if len(sys.argv) > 2:
        count = int(sys.argv[2])

    with open(datafile, "r") as d:
        # Skip blank lines: an empty string would be trained as a
        # zero-length name and skew the model toward ending words at once.
        data = [name for name in map(str.strip, d) if name]  # type: List[str]

    # With no samples no name can ever be produced, so the generation loop
    # below would never finish.
    if not data:
        sys.exit("{}: no names found".format(datafile))

    # Order-3 model with no smoothing prior: only letter sequences observed
    # in the samples can be generated.
    generator = MarkovNameGenerator(data, 3, 0.0)
    print("\n".join(generator.generateNames(count)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment