Last active
August 29, 2015 14:07
-
-
Save mstaflex/daa51ac2c658867a634c to your computer and use it in GitHub Desktop.
Gist to select names from a database file according to a regex scheme and a set of countries. Mainly for games like "my daughter will be given a name with an a and a u" :P The database file can be found here: https://gist.github.com/mstaflex/161edf0c61a764a3345f
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# NOTE(review): not referenced anywhere in the visible code. 44 is also the
# key of "Swiss" in country_column below -- presumably it marks a west/east
# split of the country columns; confirm before relying on it.
EAST_WEST_BORDER_COLUMN = 44

# Name-selection pattern. A name is accepted when, starting at its first
# character, a run of word characters contains a lowercase "n" followed later
# by a lowercase "u", or a lowercase "u" followed later by a lowercase "n".
# Written as a raw string so that "\w" reaches the regex engine unchanged
# instead of being an invalid string escape.
# The extra alias `p` is kept only for backward compatibility.
REGEX_COMPILED = p = re.compile(r'([\w]*n[\w]*u[\w]*)|[\w]*u[\w]*n[\w]*')

# File names used by the script: the raw name database, the intermediate list
# of names that passed the filters, and the final de-duplicated list.
input_file_data_base = "nam_dict.txt"
output_file_filtered = "name_database_filtered.txt"
output_file = "possible_names_selection.txt"

# Maps a character column of a database record to the country whose
# per-name weight is stored in that column.
country_column = {
    30: "Great Britain",
    31: "Ireland",
    32: "U.S.A.",
    33: "Italy",
    34: "Malta",
    35: "Portugal",
    36: "Spain",
    37: "France",
    38: "Belgium",
    39: "Luxembourg",
    40: "the Netherlands",
    41: "East Frisia",
    42: "Germany",
    43: "Austria",
    44: "Swiss",
    45: "Iceland",
    46: "Denmark",
    47: "Norway",
    48: "Sweden",
    49: "Finland",
    50: "Estonia",
    51: "Latvia",
    52: "Lithuania",
    53: "Poland",
    54: "Czech Republic",
    55: "Slovakia",
    56: "Hungary",
    57: "Romania",
    58: "Bulgaria",
    59: "Bosnia and Herzegovina",
    60: "Croatia",
    61: "Kosovo",
    62: "Macedonia",
    63: "Montenegro",
    64: "Serbia",
    65: "Slovenia",
    66: "Albania",
    67: "Greece",
    68: "Russia",
    69: "Belarus",
    70: "Moldova",
    71: "Ukraine",
    72: "Armenia",
    73: "Azerbaijan",
    74: "Georgia",
    75: "Kazakhstan/Uzbekistan,etc.",
    76: "Turkey",
    77: "Arabia/Persia",
    78: "Israel",
    79: "China",  # was misspelled "Chine"
    80: "India/Sri Lanka",
    81: "Japan",
    82: "Korea",
    83: "Vietnam",
    84: "others",
}

# Countries whose weights count in favour of a name; every label must match a
# value of country_column exactly.
selected_countries = ["Germany", "U.S.A.", "France", "Spain", "Austria", "Swiss"]
def regex_filter(name):
    """Tell whether ``name`` is accepted by the module-level ``REGEX_COMPILED``.

    The pattern is applied with ``match``, i.e. anchored at the start of
    ``name``. Returns ``True`` when it matches and ``False`` otherwise.
    """
    return REGEX_COMPILED.match(name) is not None
def pre_filter(input, output, select_weight=2, gender_filter="M"):
    """Copy the names that pass every filter from ``input`` to ``output``.

    ``input`` is read as a fixed-width text file, one record per line:
    the name is taken from characters 3..26, and each column listed in the
    module-level ``country_column`` holds a single hexadecimal digit giving
    the weight of the name for that country (" ", "+" and "-" mean "no
    weight").

    A record is skipped when any of the following holds:

    * it starts with "#";
    * it starts with ``gender_filter``, "?" + ``gender_filter`` or "? ";
    * its name contains "+";
    * its name is rejected by ``regex_filter``;
    * ``select_weight`` times the summed weight of the non-selected
      countries exceeds the summed weight of the countries listed in
      ``selected_countries``.

    Every surviving name is written to ``output`` on its own line.

    Unlike the earlier read-ahead loop, every line of the file is examined,
    including the first one, and a record that ends before the last country
    column is treated as having no weight in the missing columns instead of
    raising ``IndexError``.

    Args:
        input: path of the database file to read.
        output: path of the file to (over)write with the accepted names.
        select_weight: factor applied to the weight of non-selected
            countries before comparing it to the selected-country weight.
        gender_filter: leading marker of the records to drop.

    Raises:
        ValueError: if a country column holds a character that is neither
            a hexadecimal digit nor one of " ", "+", "-".
    """
    # Loop invariants: the prefixes that disqualify a record, the country
    # columns to inspect, and a set for O(1) "is this country selected?".
    rejected_prefixes = (gender_filter, "?" + gender_filter, "? ")
    columns = sorted(country_column.items())
    wanted = set(selected_countries)

    with open(input, "r") as f, open(output, "w") as fo:
        for raw_line in f:
            # Drop the line terminator so it can never be read as a weight.
            line = raw_line.rstrip("\r\n")
            if line.startswith("#"):
                continue
            if line.startswith(rejected_prefixes):
                continue

            name = line[3:27].strip()
            if "+" in name:
                continue
            if not regex_filter(name):
                continue

            pro_cnt = 0
            counter_cnt = 0
            for column, country in columns:
                # Slicing (not indexing) yields "" past the end of a short
                # record, which is handled like a blank column.
                flag = line[column:column + 1]
                if flag in ("", " ", "+", "-"):
                    continue
                val = int(flag, 16)
                if country in wanted:
                    pro_cnt += val
                else:
                    counter_cnt += val

            if select_weight * counter_cnt > pro_cnt:
                continue
            fo.write("%s\n" % (name))
def uniquify(input, output):
    """Write each distinct name found in ``input`` to ``output`` exactly once.

    ``input`` is read line by line; surrounding whitespace is stripped and
    blank lines are ignored. Names are written one per line in the order of
    their first appearance.

    The earlier version read a line before the loop and then immediately
    read another one inside it, so the first name (and, depending on how the
    trailing read was nested, further names) never reached the output. Every
    line is now considered.

    Args:
        input: path of the text file holding one name per line.
        output: path of the file to (over)write with the unique names.
    """
    seen = set()
    unique_names = []
    with open(input, "r") as f:
        for raw_line in f:
            name = raw_line.strip()
            if not name or name in seen:
                continue
            seen.add(name)
            unique_names.append(name)

    # The output is opened only after the input has been fully read.
    with open(output, "w") as fo:
        fo.writelines("%s\n" % (name) for name in unique_names)
# Stage 1: extract the candidate names from the database. With
# select_weight=1 a name is dropped as soon as the non-selected countries
# outweigh the selected ones.
pre_filter(
    input=input_file_data_base,
    output=output_file_filtered,
    select_weight=1,
)

# Stage 2: collapse the candidate list into the final list of unique names.
uniquify(input=output_file_filtered, output=output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment