Skip to content

Instantly share code, notes, and snippets.

@vijayanandrp
Last active November 30, 2017 14:28
Show Gist options
  • Save vijayanandrp/8f0d61d9e8b7ffc4df495dc6881b2f54 to your computer and use it in GitHub Desktop.
Save vijayanandrp/8f0d61d9e8b7ffc4df495dc6881b2f54 to your computer and use it in GitHub Desktop.

You can download the dataset here.

#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-

import os
import random
from zipfile import ZipFile
from nltk import NaiveBayesClassifier, MaxentClassifier, DecisionTreeClassifier, classify

Names = (Unique_Name, Male_frequency_count, Female_frequency_count)

def load_names(zip_file='data_set/names.zip'):
    """Aggregate per-gender name frequencies from every file in a zip archive.

    Every member of the archive is decoded as UTF-8 and read as
    comma-separated rows of the form ``name,gender,count``, where gender is
    ``M`` or ``F`` (case-insensitive). Rows that do not have exactly three
    fields, carry an unknown gender code, or have a non-integer count (for
    example a header row) are skipped.

    If ``zip_file`` does not exist, a message is printed and the process
    exits with status -1.

    :param zip_file: path to the zip archive holding the name files.
    :return: dict mapping each upper-cased name to
        ``[male_count, female_count]`` summed over all files.
    """
    if not os.path.isfile(zip_file):
        print('names.zip is missing.')
        exit(-1)

    names = dict()
    # Index of each gender inside the per-name [male, female] counter.
    gender_map = {'M': 0, 'F': 1}

    # The context manager guarantees the archive is closed, even when a
    # member cannot be decoded.
    with ZipFile(zip_file, 'r') as archive:
        for member in archive.namelist():
            text = archive.read(member).decode('utf-8')
            for line in text.splitlines():
                fields = [field.strip() for field in line.split(',')]
                if len(fields) != 3:
                    continue

                # Validate the whole row before touching ``names`` so a bad
                # row never leaves a half-initialised entry behind.
                gender = gender_map.get(fields[1].upper())
                if gender is None:
                    continue
                try:
                    count = int(fields[2])
                except ValueError:
                    continue

                name = fields[0].upper()
                # Accumulate the frequency in the slot for this gender.
                names.setdefault(name, [0, 0])[gender] += count
    return names

Converting into tuple (name, male_freq_count, female_freq_count)

def split_names(names: dict):
    """Split a name-frequency dict into male-majority and female-majority lists.

    Each entry of ``names`` maps a name to a sequence whose first two items
    are the male and female frequency counts. A name is placed in the male
    list when its male count is larger, in the female list when its female
    count is larger, and dropped when both counts are equal.

    A short overview of the resulting sizes is printed. If ``names`` is
    empty or ``None``, a message is printed and the process exits with
    status -1.

    :param names: dict mapping name to ``[male_count, female_count]``.
    :return: tuple ``(male_names, female_names)``; each list holds tuples of
        ``(name, male_freq_count, female_freq_count)``.
    """
    if not names:
        print('names dict is none.')
        exit(-1)

    male_names = []
    female_names = []

    for name, counts in names.items():
        male_count, female_count = counts[0], counts[1]

        # A tie gives no usable gender signal, so the name is left out.
        if male_count == female_count:
            continue

        record = (name, male_count, female_count)
        if male_count > female_count:
            male_names.append(record)
        else:
            female_names.append(record)

    total_males_names = len(male_names)
    total_females_names = len(female_names)
    total_names = total_females_names + total_males_names
    print('Dataset Overview.\n Total names - {} \n Total males names - '
          '{} \n Total female names - {}'.format(total_names, total_males_names,
                                                 total_females_names))
    return male_names, female_names

Sample of the data format from which the final dataset will be prepared (each name maps to [male_count, female_count]).

    'ELMINA': [0, 1385], 'ALEARA': [0, 17], 'BEMNET': [31, 42], 'YISSELL': [0, 30],
    'IZAIYAH': [271, 0], 'BREELLA': [0, 66], 'VANDON': [79, 0], 'BRIAUNA': [0, 2585],
    'NAIDELIN': [0, 430], 'AMIS': [10, 0], 'NINNA': [0, 69], 'ILETTA': [0, 10],
    'KHRYSTEN': [0, 58], 'LORINA': [0, 3482], 'RINALDO': [1749, 0], 'DALAJAH': [0, 12],
    'DEVREN': [44, 0], 'JIAHAO': [12, 0], 'KAITLIN': [150, 112330], 'DEVRON': [1680, 0],
    'OWYN': [720, 15], 'ADJI': [0, 10], 'CHURCH': [10, 0], 'MISSI': [0, 444], 'KUSHANA': [0, 46],
    'LAYTEN': [334, 22], 'JEANMARC': [317, 0], 'MONTERRIUS': [152, 0], 'SAMHITA': [0, 552],
    'NATACHA': [0, 2709], 'KHALISAH': [0, 6], 'HARLEQUINN': [0, 10]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment