mstaflex · August 29, 2015 14:07
diff --git a/name_filter.py b/name_filter.py
 import re

 # Not referenced by any code visible in this file; kept so the name stays importable.
 EAST_WEST_BORDER_COLUMN = 44
 # Raw string so the ``\w`` escapes reach the regex engine untouched instead of
 # being treated as (invalid) Python string escapes.
 # Accepts a run of word characters, starting at the first character, that
 # holds a lower-case "n" later followed by "u", or a "u" later followed by "n".
 # ``p`` is a second name for the same compiled pattern.
 REGEX_COMPILED = p = re.compile(r'([\w]*n[\w]*u[\w]*)|[\w]*u[\w]*n[\w]*')

 # Input dictionary, intermediate filtered list, and final de-duplicated list.
 input_file_data_base = "nam_dict.txt"
 output_file_filtered = "name_database_filtered.txt"
 output_file = "possible_names_selection.txt"

 # Character column of a dictionary line -> label of the country whose value
 # is stored in that column.
 country_column = {
 	30: "Great Britain",
 	31: "Ireland",
 	32: "U.S.A.",
 	33: "Italy",
 	34: "Malta",
 	35: "Portugal",
 	36: "Spain",
 	37: "France",
 	38: "Belgium",
 	39: "Luxembourg",
 	40: "the Netherlands",
 	41: "East Frisia",
 	42: "Germany",
 	43: "Austria",
 	44: "Swiss",
 	45: "Iceland",
 	46: "Denmark",
 	47: "Norway",
 	48: "Sweden",
 	49: "Finland",
 	50: "Estonia",
 	51: "Latvia",
 	52: "Lithuania",
 	53: "Poland",
 	54: "Czech Republic",
 	55: "Slovakia",
 	56: "Hungary",
 	57: "Romania",
 	58: "Bulgaria",
 	59: "Bosnia and Herzegovina",
 	60: "Croatia",
 	61: "Kosovo",
 	62: "Macedonia",
 	63: "Montenegro",
 	64: "Serbia",
 	65: "Slovenia",
 	66: "Albania",
 	67: "Greece",
 	68: "Russia",
 	69: "Belarus",
 	70: "Moldova",
 	71: "Ukraine",
 	72: "Armenia",
 	73: "Azerbaijan",
 	74: "Georgia",
 	75: "Kazakhstan/Uzbekistan,etc.",
 	76: "Turkey",
 	77: "Arabia/Persia",
 	78: "Israel",
 	79: "Chine",
 	80: "India/Sri Lanka",
 	81: "Japan",
 	82: "Korea",
 	83: "Vietnam",
 	84: "others",
 }
 # Countries whose column values count in favour of a name in pre_filter.
 selected_countries = ["Germany", "U.S.A.", "France", "Spain", "Austria", "Swiss"]

 def regex_filter(name):
 	"""Return True when the module-level ``REGEX_COMPILED`` pattern matches *name*.

 	The pattern is applied with ``match``, i.e. anchored at the first
 	character of *name*; False is returned when there is no match.
 	"""
 	matched = REGEX_COMPILED.match(name)
 	return matched is not None

 def pre_filter(input, output, select_weight=2, gender_filter="M"):
 	"""Write the names from file *input* that pass every filter to file *output*.

 	A line is dropped when any of the following holds:

 	* it starts with "#", with *gender_filter*, with "?" + *gender_filter*,
 	  or with "? ";
 	* the name taken from characters 3..26 contains "+";
 	* ``regex_filter`` rejects the name;
 	* ``select_weight * counter_cnt > pro_cnt``, where ``pro_cnt`` sums the
 	  hexadecimal digits found in the columns of ``selected_countries`` and
 	  ``counter_cnt`` sums the digits of every other column listed in
 	  ``country_column``.

 	Columns holding " ", "+" or "-" and columns missing from a short line
 	contribute nothing to either sum.

 	Args:
 		input: path of the dictionary file to read.
 		output: path of the file to (over)write, one accepted name per line.
 		select_weight: multiplier applied to ``counter_cnt`` before comparing.
 		gender_filter: leading marker of the lines to drop.

 	Raises:
 		ValueError: if a country column holds a character that is not a
 			hexadecimal digit, " ", "+" or "-".
 	"""
 	skipped_prefixes = ("#", gender_filter, "?" + gender_filter, "? ")
 	with open(input, "r") as f, open(output, "w") as fo:
 		# Iterating the file object examines every line, the first one included.
 		for line in f:
 			if line.startswith(skipped_prefixes):
 				continue
 			name = line[3:27].strip()
 			if "+" in name or not regex_filter(name):
 				continue
 			# Drop the line terminator so it is never parsed as a column value.
 			record = line.rstrip("\r\n")
 			pro_cnt = 0
 			counter_cnt = 0
 			for column, country in country_column.items():
 				# Slicing yields "" past the end of a short line instead of raising.
 				cell = record[column:column + 1]
 				if cell in ("", " ", "+", "-"):
 					continue
 				val = int(cell, 16)
 				if country in selected_countries:
 					pro_cnt += val
 				else:
 					counter_cnt += val
 			if select_weight * counter_cnt > pro_cnt:
 				continue
 			fo.write("%s\n" % (name))


 def uniquify(input, output):
 	"""Copy the distinct non-empty lines of file *input* to file *output*.

 	Every line is stripped of surrounding whitespace. Lines that are empty
 	after stripping are ignored (they do not stop processing), and each
 	remaining value is written once, in order of first appearance, followed
 	by a newline.

 	Args:
 		input: path of the file to read, one name per line.
 		output: path of the file to (over)write.
 	"""
 	seen = set()
 	unique_names = []
 	with open(input, "r") as f:
 		# One read per iteration: every line is considered exactly once.
 		for line in f:
 			name = line.strip()
 			if name and name not in seen:
 				seen.add(name)
 				unique_names.append(name)
 	with open(output, "w") as fo:
 		fo.writelines("%s\n" % (name) for name in unique_names)


 # Stage 1: filter the name dictionary into the intermediate file.
 pre_filter(
 	input_file_data_base,
 	output_file_filtered,
 	select_weight=1
 )
 # Stage 2: de-duplicate the intermediate file into the final selection.
 uniquify(
 	output_file_filtered,
 	output_file
 )
	import re

	# Not referenced by any code visible in this file; kept so the name stays importable.
	EAST_WEST_BORDER_COLUMN = 44
	# Raw string so the ``\w`` escapes reach the regex engine untouched.
	# Accepts a run of word characters, starting at the first character, that
	# holds a lower-case "n" later followed by "u", or a "u" later followed by "n".
	# The ``*`` quantifiers and the unescaped ``|`` are required: without them
	# the pattern demands a literal "|" and rejects ordinary names.
	# ``p`` is a second name for the same compiled pattern.
	REGEX_COMPILED = p = re.compile(r'([\w]*n[\w]*u[\w]*)|[\w]*u[\w]*n[\w]*')

	# Input dictionary, intermediate filtered list, and final de-duplicated list.
	input_file_data_base = "nam_dict.txt"
	output_file_filtered = "name_database_filtered.txt"
	output_file = "possible_names_selection.txt"

	# Character column of a dictionary line -> label of the country whose value
	# is stored in that column.
	country_column = {
		30: "Great Britain",
		31: "Ireland",
		32: "U.S.A.",
		33: "Italy",
		34: "Malta",
		35: "Portugal",
		36: "Spain",
		37: "France",
		38: "Belgium",
		39: "Luxembourg",
		40: "the Netherlands",
		41: "East Frisia",
		42: "Germany",
		43: "Austria",
		44: "Swiss",
		45: "Iceland",
		46: "Denmark",
		47: "Norway",
		48: "Sweden",
		49: "Finland",
		50: "Estonia",
		51: "Latvia",
		52: "Lithuania",
		53: "Poland",
		54: "Czech Republic",
		55: "Slovakia",
		56: "Hungary",
		57: "Romania",
		58: "Bulgaria",
		59: "Bosnia and Herzegovina",
		60: "Croatia",
		61: "Kosovo",
		62: "Macedonia",
		63: "Montenegro",
		64: "Serbia",
		65: "Slovenia",
		66: "Albania",
		67: "Greece",
		68: "Russia",
		69: "Belarus",
		70: "Moldova",
		71: "Ukraine",
		72: "Armenia",
		73: "Azerbaijan",
		74: "Georgia",
		75: "Kazakhstan/Uzbekistan,etc.",
		76: "Turkey",
		77: "Arabia/Persia",
		78: "Israel",
		79: "Chine",
		80: "India/Sri Lanka",
		81: "Japan",
		82: "Korea",
		83: "Vietnam",
		84: "others",
	}
	# Countries whose column values count in favour of a name in pre_filter.
	selected_countries = ["Germany", "U.S.A.", "France", "Spain", "Austria", "Swiss"]

	def regex_filter(name):
		"""Return True when the module-level ``REGEX_COMPILED`` pattern matches *name*.

		The pattern is applied with ``match``, i.e. anchored at the first
		character of *name*; False is returned when there is no match.
		"""
		return REGEX_COMPILED.match(name) is not None

	def pre_filter(input, output, select_weight=2, gender_filter="M"):
		"""Write the names from file *input* that pass every filter to file *output*.

		A line is dropped when any of the following holds:

		* it starts with "#", with *gender_filter*, with "?" + *gender_filter*,
		  or with "? ";
		* the name taken from characters 3..26 contains "+";
		* ``regex_filter`` rejects the name;
		* ``select_weight * counter_cnt > pro_cnt``, where ``pro_cnt`` sums the
		  hexadecimal digits found in the columns of ``selected_countries`` and
		  ``counter_cnt`` sums the digits of every other column listed in
		  ``country_column``.

		Columns holding " ", "+" or "-" and columns missing from a short line
		contribute nothing to either sum.

		Args:
			input: path of the dictionary file to read.
			output: path of the file to (over)write, one accepted name per line.
			select_weight: multiplier applied to ``counter_cnt`` before comparing.
			gender_filter: leading marker of the lines to drop.

		Raises:
			ValueError: if a country column holds a character that is not a
				hexadecimal digit, " ", "+" or "-".
		"""
		skipped_prefixes = ("#", gender_filter, "?" + gender_filter, "? ")
		with open(input, "r") as f, open(output, "w") as fo:
			# Iterating the file object examines every line, the first one included.
			for line in f:
				if line.startswith(skipped_prefixes):
					continue
				name = line[3:27].strip()
				if "+" in name or not regex_filter(name):
					continue
				# Drop the line terminator so it is never parsed as a column value.
				record = line.rstrip("\r\n")
				pro_cnt = 0
				counter_cnt = 0
				for column, country in country_column.items():
					# Slicing yields "" past the end of a short line instead of raising.
					cell = record[column:column + 1]
					if cell in ("", " ", "+", "-"):
						continue
					val = int(cell, 16)
					if country in selected_countries:
						pro_cnt += val
					else:
						counter_cnt += val
				if select_weight * counter_cnt > pro_cnt:
					continue
				fo.write("%s\n" % (name))


	def uniquify(input, output):
		"""Copy the distinct non-empty lines of file *input* to file *output*.

		Every line is stripped of surrounding whitespace. Lines that are empty
		after stripping are ignored (they do not stop processing), and each
		remaining value is written once, in order of first appearance, followed
		by a newline.

		Args:
			input: path of the file to read, one name per line.
			output: path of the file to (over)write.
		"""
		seen = set()
		unique_names = []
		with open(input, "r") as f:
			# One read per iteration: every line is considered exactly once.
			for line in f:
				name = line.strip()
				if name and name not in seen:
					seen.add(name)
					unique_names.append(name)
		with open(output, "w") as fo:
			fo.writelines("%s\n" % (name) for name in unique_names)


	# Stage 1: filter the name dictionary into the intermediate file.
	pre_filter(
		input_file_data_base,
		output_file_filtered,
		select_weight=1
	)
	# Stage 2: de-duplicate the intermediate file into the final selection.
	uniquify(
		output_file_filtered,
		output_file
	)