Last active
July 18, 2017 15:59
-
-
Save johannestaas/aca467f3b56dc93814f2d2549ba99866 to your computer and use it in GitHub Desktop.
Namerator - Name generator PoC based on text patterns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
namerator
=========

Name generator: builds names one character at a time, based on the
frequencies of letter patterns found in a list of example names.

Usage:

    $ python namerator.py -n 5 elf_names.txt
    egil-gonamilin
    taedhorilie
    lalorahinduil
    cirduiel
    amrothrie

License: GPLv3+
See https://www.gnu.org/licenses/gpl-3.0.en.html
'''
from math import sqrt | |
from random import gauss, seed, choice | |
from collections import Counter | |
def load(path):
    '''Read a list of names from a text file, one name per line.

    Surrounding whitespace is stripped from every line and lines that are
    empty after stripping are skipped.

    The file is decoded as UTF-8 explicitly. Name lists commonly contain
    accented letters, and relying on the platform's default encoding makes
    the result differ (or fail) from one system to another.

    :param path: path of the text file to read.
    :returns: list of the non-empty, stripped lines, in file order.
    '''
    with open(path, encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]
def chop_name(name):
    '''Split a name into the padded character list used for counting.

    Only the first whitespace-separated word of ``name`` is used, and it is
    lower-cased. Four empty strings are put in front so that every real
    character has four predecessors to look back at, and ``'$'`` is
    appended as the end-of-name marker.

    :param name: raw name. An empty or whitespace-only name yields just the
        padding followed by the end marker.
    :returns: ``['', '', '', ''] + characters + ['$']``.
    '''
    words = name.lower().split()
    # With no word at all, words[0] would raise IndexError.
    chars = list(words[0]) if words else []
    return [''] * 4 + chars + ['$']
def inc_frequencies(chop, freqs):
    '''Add the character transitions of one chopped name to ``freqs``.

    ``freqs`` is updated in place and also returned. Missing tables are
    created on first use:

    * ``freqs[0]`` -- Counter of every character counted (``'$'`` included).
    * ``freqs[k]`` for k in 1..4 -- dict mapping a context (the ``k``
      entries of ``chop`` preceding a position, concatenated) to a Counter
      of the characters found at that position.

    :param chop: list of characters with four leading ``''`` padding
        entries and a trailing ``'$'`` (the shape ``chop_name`` returns).
    :param freqs: dict of frequency tables to update.
    :returns: the same ``freqs`` object.
    '''
    freqs.setdefault(0, Counter())
    for order in range(1, 5):
        freqs.setdefault(order, {})
    # Start at index 4, the first entry after the padding. Starting at 3
    # would make chop[i - 4] wrap around to chop[-1] (the '$' marker) and
    # would count the padding '' as if it were a real character.
    for i in range(4, len(chop)):
        char = chop[i]
        freqs[0][char] += 1
        for order in range(1, 5):
            context = ''.join(chop[i - order:i])
            freqs[order].setdefault(context, Counter())[char] += 1
    return freqs
def calc_frequencies(names):
    '''Build the frequency tables for a collection of names.

    The returned dict holds a ``'len'`` Counter of the raw length of each
    name (``len(name)``), plus whatever tables ``inc_frequencies`` adds
    for the chopped form of each name.

    :param names: iterable of name strings.
    :returns: dict of frequency tables.
    '''
    length_counts = Counter()
    tables = {'len': length_counts}
    for entry in names:
        length_counts[len(entry)] += 1
        inc_frequencies(chop_name(entry), tables)
    return tables
def combined_frequencies(freqs, last1, last2, last3, last4):
    '''Merge the next-character counts of the four context lengths.

    The counts recorded for each context are weighted 1, 5, 10 and 25 for
    the 1-, 2-, 3- and 4-character context respectively, then summed per
    character. A context that was never recorded contributes nothing.

    :param freqs: frequency tables; ``freqs[k]`` (k in 1..4) maps a context
        string to a Counter of following characters.
    :param last1: context used to look up ``freqs[1]``.
    :param last2: context used to look up ``freqs[2]``.
    :param last3: context used to look up ``freqs[3]``.
    :param last4: context used to look up ``freqs[4]``.
    :returns: list of ``(character, weighted_count)`` pairs as produced by
        ``Counter.most_common()``.
    '''
    weighted_contexts = (
        (1, last1, 1),
        (2, last2, 5),
        (3, last3, 10),
        (4, last4, 25),
    )
    combined = Counter()
    for order, context, weight in weighted_contexts:
        # Multiplying once gives the same totals as adding the same
        # Counter `weight` times over.
        for char, count in freqs[order].get(context, {}).items():
            combined[char] += count * weight
    return combined.most_common()
def choose(common, over=0, letter_freqs=None):
    '''Pick one character at random, weighted by its count.

    :param common: sequence of ``(character, count)`` pairs.
    :param over: when negative, the end marker ``'$'`` is left out of the
        draw; when zero or positive, ``'$'`` receives an extra
        ``count * (over + 1)`` entries, where ``count`` is its count in
        ``common`` (or 1 if it is absent).
    :param letter_freqs: fallback ``(character, count)`` pairs, drawn from
        with the same ``over`` when ``common`` leaves nothing to pick.
    :returns: the chosen character.
    '''
    may_end = over >= 0
    pool = []
    for symbol, count in common:
        if symbol == '$' and not may_end:
            continue
        pool.extend([symbol] * count)
    if may_end:
        end_count = dict(common).get('$', 1)
        pool.extend(['$'] * end_count * (over + 1))
    if pool:
        return choice(pool)
    return choose(letter_freqs, over=over)
def calc_gauss(lens):
    '''Return the mean and sample standard deviation of a length histogram.

    :param lens: iterable of ``(length, count)`` pairs, for example the
        result of ``Counter.most_common()``.
    :returns: tuple ``(mean, sigma)``. ``sigma`` uses the ``n - 1``
        denominator and is ``0.0`` when there is only a single sample,
        since one sample has no spread to estimate.
    :raises ValueError: if ``lens`` contains no samples.
    '''
    pairs = list(lens)
    total = sum(num for _, num in pairs)
    if total <= 0:
        raise ValueError('cannot compute length statistics without any names')
    mean = sum(length * num for length, num in pairs) / total
    if total == 1:
        # Dividing by n - 1 would be a division by zero here.
        return mean, 0.0
    squared_error = sum(num * (mean - length) ** 2 for length, num in pairs)
    return mean, sqrt(squared_error / (total - 1))
def generate(freqs):
    '''Generate one random name from the frequency tables.

    A target length is drawn from a normal distribution fitted to the
    recorded name lengths, and never goes below the shortest recorded
    length. Characters are then drawn one at a time from the combined
    frequencies of the last 1 to 4 characters of the name so far, until
    the end marker ``'$'`` is drawn.

    :param freqs: frequency tables; reads ``freqs['len']``, ``freqs[0]``
        and, through ``combined_frequencies``, ``freqs[1]``..``freqs[4]``.
    :returns: the generated name.
    '''
    length_counts = freqs['len'].most_common()
    shortest = min(length for length, _ in length_counts)
    mean, sigma = calc_gauss(length_counts)
    namelen = int(max(gauss(mean, sigma), shortest))
    # The fallback letter table does not change while generating, so sort
    # it once instead of once per character.
    letter_freqs = freqs[0].most_common()
    name = ''
    while True:
        # Contexts are sliced from the tail of the name, so an empty pick
        # (which adds nothing to the name) leaves them unchanged.
        combined = combined_frequencies(
            freqs, name[-1:], name[-2:], name[-3:], name[-4:])
        next_letter = choose(combined, over=len(name) - namelen,
                             letter_freqs=letter_freqs)
        if next_letter == '$':
            return name
        name += next_letter
def main():
    '''Command-line entry point.

    Loads the names from the file given as ``path``, builds the frequency
    tables, optionally dumps them as JSON to ``--freq-output``, then prints
    ``--num`` generated names (default 10) that are not already in the
    input list.

    Exits with a usage error when the input file contains no names, and
    with status 1 when the generator keeps reproducing known names.
    '''
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument('path')
    parser.add_argument('--num', '-n', type=int, default=10)
    parser.add_argument('--freq-output', '-f')
    args = parser.parse_args()

    names = load(args.path)
    if not names:
        # Without any name there is nothing to build the tables from.
        parser.error('no names found in {}'.format(args.path))
    freqs = calc_frequencies(names)

    if args.freq_output:
        dump = {
            'len': freqs['len'].most_common(),
            'frequency': freqs[0].most_common(),
        }
        for label, order in (('first', 1), ('second', 2),
                             ('third', 3), ('fourth', 4)):
            dump[label] = {
                context: counts.most_common()
                for context, counts in freqs[order].items()
            }
        with open(args.freq_output, 'w') as f:
            json.dump(dump, f, indent=4)
        print('Dumped frequencies to {args.freq_output}'.format(args=args))

    # Generated names are lower-cased single words, so the known set also
    # holds that form of every input line, not only the raw line.
    known = set(names)
    known.update(n.lower().split()[0] for n in names)

    # Cap on consecutive rejected names, so a model that can only rebuild
    # its own input cannot loop forever.
    max_rejects = 1000
    produced = 0
    rejects = 0
    while produced < args.num:
        name = generate(freqs)
        if name in known:
            rejects += 1
            if rejects >= max_rejects:
                parser.exit(
                    1, 'giving up: only names from the input list '
                       'are being generated\n')
            seed()
            continue
        rejects = 0
        print(name)
        produced += 1
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
azaghâl | |
balin | |
bifur | |
bofur | |
bombur | |
borin | |
dáin | |
dís | |
dori | |
durin | |
dwalin | |
farin | |
fíli | |
flói | |
frár | |
frerin | |
frór | |
fundin | |
gamil | |
gimli | |
glóin | |
gróin | |
grór | |
ibûn | |
khîm | |
kíli | |
lóni | |
mîm | |
náin | |
náli | |
nár | |
narvi | |
nori | |
óin | |
ori | |
telchar | |
thorin | |
thráin | |
thrór |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
aegnor | |
amarië | |
amdír | |
amras | |
amrod | |
amroth | |
anairë | |
angrod | |
annael | |
aranwë | |
aredhel | |
argon | |
arminas | |
beleg | |
caranthir | |
celeborn | |
celebrían | |
celebrimbor | |
celegorm | |
círdan | |
curufin | |
daeron | |
denethor | |
eärwen | |
ecthelion | |
edrahil | |
egalmoth | |
eldalótë | |
elemmakil | |
elemmírë | |
elenwë | |
elmo | |
enel | |
enerdhil | |
eöl | |
erestor | |
fëanor | |
finarfin | |
findis | |
finduilas | |
fingolfin | |
fingon | |
finrod | |
finwë | |
galadhon | |
galadriel | |
galathil | |
galdor | |
galion | |
gelmir | |
gelmir | |
gildor | |
gil-galad | |
glorfindel | |
glorfindel | |
guilin | |
gwindor | |
haldir | |
idril | |
imin | |
indis | |
ingwë | |
ingwion | |
irimë | |
legolas | |
lenwë | |
lúthien | |
mablung | |
maedhros | |
maeglin | |
maglor | |
mahtan | |
míriel | |
mithrellas | |
nellas | |
nerdanel | |
nimloth | |
olwë | |
orodreth | |
oropher | |
orophin | |
pengolodh | |
rúmil | |
rúmil | |
tata | |
thingol | |
thranduil | |
turgon | |
voronwë |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment