-
-
Save boredzo/5d0efa528a572ad4aaee278ee8d1f9fa to your computer and use it in GitHub Desktop.
Dissociator to generate names by example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#
# Based on https://gist.github.com/plaidfluff/aa9983161b2b56f8f2750d661279cb9e
#
# Silly thing to generate random names from examples. Uses the CSV files obtained from
# http://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-national-level-data
#
# Any corpus will work if it's formatted like:
# Name,[ignored],weight
import csv | |
import sys | |
import collections | |
import random | |
import argparse | |
# Command-line interface: an optional -n/--count, while every argument the
# parser does not recognise is taken as the path of a name-data file.
parser = argparse.ArgumentParser(
    description='Generate one or more random names. Takes data such as http://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-national-level-data as input.',
)
parser.add_argument(
    '-n', '--count',
    type=int,
    default=1,
    help='Number of names to generate.',
)
options, filenames = parser.parse_known_args()
if not filenames:
    # Nothing to learn from: print usage plus this message and exit.
    parser.error('Must pass at least one file containing raw name data like "Sam,G,9999".')
class Node:
    """One state of the letter chain, holding weighted counts of what follows it."""

    def __init__(self):
        # Accumulated weight per following letter; missing letters read as 0.
        self.next_nodes = collections.defaultdict(int)
        # Running sum of the weights added to this node.
        self.total = 0
# Chain states learned from the corpus. The key None is the start state; every
# other key is a (letter, position) pair for a letter seen at that index.
nodes = collections.defaultdict(Node)
for arg in filenames:
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation recommends for files handed to csv.reader.
    with open(arg, 'r', newline='') as file:
        for row in csv.reader(file):
            if not row:
                # csv.reader yields [] for a blank line; indexing it would
                # raise IndexError, so skip it.
                continue
            # Column 0 is the name, column 2 its weight; column 1 is ignored.
            weight = int(row[2])
            node = nodes[None]
            for pos, c in enumerate(row[0]):
                # Record the transition from the current state to letter c,
                # then move to the state for c at this position.
                node.total += weight
                node.next_nodes[c] += weight
                node = nodes[(c, pos)]
            # A None successor records that the name ended in this state.
            node.total += weight
            node.next_nodes[None] += weight
# for letter,weights in nodes.items(): | |
# print "{} = {}".format(letter, weights.total) | |
# for nn,wt in weights.next_nodes.items(): | |
# print " -> {} = {}".format(nn, wt) | |
# sys.exit(0) | |
def pick_weighted(node):
    """Pick one key of ``node.next_nodes`` at random, in proportion to its weight.

    ``node`` must expose ``total`` (the sum of all weights) and ``next_nodes``
    (a mapping of key -> integer weight).

    Returns the chosen key. Returns None when the node holds no weight at all;
    None can also be a key in its own right, and the caller treats any falsy
    result as "stop".
    """
    if node.total <= 0:
        # Nothing was ever recorded for this state, so there is nothing to pick.
        return None
    # random.randint includes both endpoints. Drawing from 1..total gives each
    # key exactly `weight` winning outcomes; starting at 0 would add one extra
    # outcome that always lands on the first key (even a zero-weight one).
    rnd = random.randint(1, node.total)
    for k, v in node.next_nodes.items():
        rnd -= v
        if rnd <= 0:
            return k
    # Only reachable if total exceeds the sum of the weights.
    return None
def make_up_one_name():
    """Generate a single name by walking the chain from the start state.

    Letters are drawn with pick_weighted until it returns a falsy value;
    the letters drawn so far are returned as one string.
    """
    letters = []
    state = nodes[None]
    while True:
        letter = pick_weighted(state)
        if not letter:
            return ''.join(letters)
        # The number of letters collected so far is this letter's position.
        state = nodes[(letter, len(letters))]
        letters.append(letter)
def make_up_whole_name():
    """Generate a (given, middle, family) name triple.

    Single names are generated repeatedly. One longer than the cutoff becomes
    the family name. A shorter one becomes the given name if none is set yet,
    otherwise the middle name. Generation stops once both the given and the
    family name are set, so the middle name may still be None.
    """
    cutoff = len('Joseana')
    given = middle = family = None
    while given is None or family is None:
        candidate = make_up_one_name()
        if len(candidate) > cutoff:
            family = candidate
        elif given is None:
            given = candidate
        else:
            middle = candidate
    return given, middle, family
def generate_names(count):
    """Lazily yield ``count`` whole names; yields nothing when count <= 0.

    ``count`` is an integer (the command line parses it with ``type=int``).
    """
    for _ in range(count):
        yield make_up_whole_name()
# Print each generated name on its own line; the middle name is left out
# when it is missing or empty.
for given_name, middle_name, family_name in generate_names(options.count):
    parts = [given_name]
    if middle_name:
        parts.append(middle_name)
    parts.append(family_name)
    print(' '.join(parts))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example output: