# You can download the dataset here.
#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-
import os
import random
from zipfile import ZipFile
from nltk import NaiveBayesClassifier, MaxentClassifier, DecisionTreeClassifier, classify
# Each name record is a tuple: (unique_name, male_frequency_count, female_frequency_count)
def load_names(zip_file: str = 'data_set/names.zip') -> dict:
    """Aggregate per-name gender frequency counts from a zip of CSV files.

    Every member of the archive is read as UTF-8 text made of
    ``name,gender,count`` rows, where gender is ``M`` or ``F`` (case
    insensitive).  Surrounding whitespace in each field is ignored, and
    counts for the same upper-cased name are summed across all rows and
    all members.

    Rows that do not have exactly three fields, that carry an unknown
    gender code, or whose count is not an integer (for example a header
    row) are skipped.  Members that are not valid UTF-8 text are skipped
    as well.

    Args:
        zip_file: Path to the zip archive holding the name files.

    Returns:
        A dict mapping each upper-cased name to a two-item list
        ``[male_count, female_count]``.

    Raises:
        SystemExit: With status -1, after printing a message, when
            ``zip_file`` is not an existing file.
    """
    if not os.path.isfile(zip_file):
        print('names.zip is missing.')
        # ``exit`` is injected by the ``site`` module for interactive use;
        # raising SystemExit directly is equivalent and always available.
        raise SystemExit(-1)

    gender_map = {'M': 0, 'F': 1}
    names = {}

    # The context manager closes the archive even if parsing fails midway.
    with ZipFile(zip_file, 'r') as archive:
        for member in archive.namelist():
            try:
                text = archive.read(member).decode('utf-8')
            except UnicodeDecodeError:
                # Not a text member (e.g. a bundled README PDF): skip it
                # instead of aborting the whole load.
                continue

            for line in text.split('\n'):
                fields = [field.strip() for field in line.strip().split(',')]
                if len(fields) != 3:
                    continue

                gender = gender_map.get(fields[1].upper())
                try:
                    count = int(fields[2])
                except ValueError:
                    count = None
                if gender is None or count is None:
                    # Header or malformed row: ignore it, exactly like rows
                    # with the wrong number of fields.
                    continue

                # Accumulate the frequency in the slot for this gender.
                name = fields[0].upper()
                names.setdefault(name, [0, 0])[gender] += count
    return names
# Convert the names dict into (name, male_freq_count, female_freq_count) tuples.
def split_names(names: dict) -> tuple:
    """Partition aggregated name counts into male- and female-majority lists.

    Args:
        names: Mapping of name to a sequence whose first two items are the
            male and female frequency counts.

    Returns:
        A tuple ``(male_names, female_names)``.  Each element is a list of
        ``(name, male_count, female_count)`` tuples.  A name lands in the
        male list when its male count is larger and in the female list when
        its female count is larger; names with equal counts are left out.

    Raises:
        SystemExit: With status -1, after printing a message, when
            ``names`` is empty or None.
    """
    if not names:
        print('names dict is none.')
        # ``exit`` is injected by the ``site`` module for interactive use;
        # raising SystemExit directly is equivalent and always available.
        raise SystemExit(-1)

    male_names = []
    female_names = []
    for name, counts in names.items():
        male_count, female_count = counts[0], counts[1]
        if male_count == female_count:
            # No majority gender, so the name carries no usable label.
            continue
        record = (name, male_count, female_count)
        if male_count > female_count:
            male_names.append(record)
        else:
            female_names.append(record)

    total_males_names = len(male_names)
    total_females_names = len(female_names)
    total_names = total_females_names + total_males_names
    print('Dataset Overview.\n Total names - {} \n Total males names - '
          '{} \n Total female names - {}'.format(total_names, total_males_names,
                                                 total_females_names))
    return male_names, female_names
# Sample of the data format in which the final dataset is prepared
# (name: [male_count, female_count]):
# 'ELMINA': [0, 1385], 'ALEARA': [0, 17], 'BEMNET': [31, 42], 'YISSELL': [0, 30],
# 'IZAIYAH': [271, 0], 'BREELLA': [0, 66], 'VANDON': [79, 0], 'BRIAUNA': [0, 2585],
# 'NAIDELIN': [0, 430], 'AMIS': [10, 0], 'NINNA': [0, 69], 'ILETTA': [0, 10],
# 'KHRYSTEN': [0, 58], 'LORINA': [0, 3482], 'RINALDO': [1749, 0], 'DALAJAH': [0, 12],
# 'DEVREN': [44, 0], 'JIAHAO': [12, 0], 'KAITLIN': [150, 112330], 'DEVRON': [1680, 0],
# 'OWYN': [720, 15], 'ADJI': [0, 10], 'CHURCH': [10, 0], 'MISSI': [0, 444], 'KUSHANA': [0, 46],
# 'LAYTEN': [334, 22], 'JEANMARC': [317, 0], 'MONTERRIUS': [152, 0], 'SAMHITA': [0, 552],
# 'NATACHA': [0, 2709], 'KHALISAH': [0, 6], 'HARLEQUINN': [0, 10]