Created
April 12, 2016 03:16
-
-
Save fluffy-critter/aa9983161b2b56f8f2750d661279cb9e to your computer and use it in GitHub Desktop.
Dissociator to generate names by example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#
# Silly thing to generate random names from examples. Uses the CSV files obtained from
# http://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-national-level-data
#
# Any corpus will work if it's formatted like:
# Name,[ignored],weight
import csv | |
import sys | |
import collections | |
import random | |
class Node:
    """One state of the letter chain built from the corpus.

    Attributes are plain counters that the loading code fills in:
    ``next_nodes`` maps each possible following symbol to its accumulated
    weight, and ``total`` is the sum of the weights recorded on this node.
    """

    def __init__(self):
        # Successor symbol -> accumulated weight; unseen symbols read as 0.
        self.next_nodes = collections.defaultdict(int)
        # Sum of all weights added to this node so far.
        self.total = 0
# Transition table for the generator.  The key None is the start state; every
# other key is a (character, position) pair, so the same letter at different
# offsets within a name has its own set of successor weights.
nodes = collections.defaultdict(Node)

# Each command-line argument is a CSV corpus formatted as: Name,[ignored],weight
for arg in sys.argv[1:]:
    with open(arg, 'r') as corpus:
        for row in csv.reader(corpus):
            # csv.reader yields an empty list for blank lines; skip them
            # instead of failing with IndexError on row[2].
            if not row:
                continue
            weight = int(row[2])
            node = nodes[None]
            for pos, c in enumerate(row[0]):
                # Record the transition from the current state to c, then
                # advance to the state for "c seen at offset pos".
                node.total += weight
                node.next_nodes[c] += weight
                node = nodes[(c, pos)]
            # A successor of None marks "the name ends here".
            node.total += weight
            node.next_nodes[None] += weight
# for letter,weights in nodes.items():
#     print "{} = {}".format(letter, weights.total)
#     for nn,wt in weights.next_nodes.items():
#         print " -> {} = {}".format(nn, wt)
# sys.exit(0)
def pick_weighted(node):
    """Pick one successor of *node* at random, proportionally to its weight.

    node -- object with a ``total`` attribute (sum of the weights) and a
            ``next_nodes`` mapping of successor -> integer weight.

    Returns the chosen key of ``node.next_nodes``.  Returns None when the
    node has no weight to draw from, or when the weights do not add up to
    ``node.total``; note that None may also be a legitimate key.
    """
    if node.total <= 0:
        # Nothing to choose from (e.g. empty corpus).
        return None
    # Draw from 1..total so that each successor owns exactly `weight` of the
    # `total` equally likely outcomes.  Starting the range at 0 would add an
    # extra outcome that always lands on the first key, even a zero-weight one.
    rnd = random.randint(1, node.total)
    for k, v in node.next_nodes.items():
        rnd -= v
        if rnd <= 0:
            return k
    return None
# Emit 200 generated names, one per line.  Written with range() and
# print(...) so the script runs under both Python 2 and Python 3.
for _ in range(200):
    letters = []
    node = nodes[None]  # start state
    pos = 0
    while True:
        letter = pick_weighted(node)
        if letter is None:
            # None is the end-of-name marker (or the node had nothing to pick).
            break
        letters.append(letter)
        # States are keyed by (character, offset within the name).
        node = nodes[(letter, pos)]
        pos += 1
    print(''.join(letters))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment