Created
December 21, 2016 18:30
-
-
Save paul-english/061f3984f598522ca27168227aa14820 to your computer and use it in GitHub Desktop.
fake_people.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import random | |
import string | |
import numpy as np | |
import pandas as pd | |
from faker import Faker | |
from tqdm import tqdm | |
# Module-level Faker instance; fake_record() draws every field value from it.
fake = Faker()
# --- Settings ---------------------------------------------------------------

# Number of clean records generated up front (because we do big data, right?).
n = 1000

# Number of duplication passes, and the fraction of the clean records that is
# sampled and corrupted on each pass.
n_passes = 10
percent_to_duplicate_on_pass = 0.15

# Shape parameters (a, b) of the beta distribution that decides how many
# columns of a sampled record get permuted.
cols_a, cols_b = 2, 6

# Rate of the Poisson distribution that decides how many single-character
# replacements are applied to a chosen string.
permutes_lambda = 3

# TODO: consider drawing the parameters above from a distribution inside the
# loops (driven by hyperparameters), e.g. sometimes lambda should be high and
# sometimes the beta shape should be different.
def fake_record():
    """Build one synthetic person record.

    Returns:
        A dict with the keys ``first_name``, ``last_name``, ``address``,
        ``company``, ``date``, ``phone_number``, ``job`` and ``ssn``. Every
        value comes from the module-level ``fake`` generator; ``date`` is the
        generated date-time converted with ``str()``.
    """
    # Fields are filled one by one, in a fixed order, so the sequence of
    # calls made on the generator stays the same from record to record.
    record = {}
    record['first_name'] = fake.first_name()
    record['last_name'] = fake.last_name()
    record['address'] = fake.address()
    record['company'] = fake.company()
    record['date'] = str(fake.date_time())
    record['phone_number'] = fake.phone_number()
    record['job'] = fake.job()
    record['ssn'] = fake.ssn()
    return record
def permute_string(s):
    """Return ``s`` with one randomly chosen character replaced.

    A random position is picked and the character there is overwritten with
    a random ASCII letter or digit. The replacement may happen to equal the
    character it replaces, so the result can be identical to the input.

    Args:
        s: The string to corrupt.

    Returns:
        A string of the same length as ``s``. An empty ``s`` is returned
        unchanged because there is no position to replace.
    """
    if not s:
        # random.randint(0, -1) would raise ValueError on an empty string.
        return s
    idx = random.randrange(len(s))
    # string.letters only exists on Python 2 (and is locale-dependent there);
    # string.ascii_letters is available on both Python 2 and Python 3.
    char = random.choice(string.ascii_letters + string.digits)
    return s[:idx] + char + s[idx + 1:]
def permute_row(row):
    """Corrupt a random selection of a record's fields.

    The number of column draws is
    ``int(Beta(cols_a, cols_b) * (len(row) - 1)) + 1``. Each drawn column,
    except the one labelled ``'id'``, receives ``Poisson(permutes_lambda)``
    single-character replacements via ``permute_string``.

    Args:
        row: A pandas Series holding one record (as passed by
            ``DataFrame.apply(..., axis=1)``). Values of the non-``'id'``
            columns are assumed to be strings -- TODO confirm for other
            callers.

    Returns:
        A new Series with the same index as ``row``. The input Series is
        left untouched.
    """
    if len(row) == 0:
        # No columns to draw from; np.random.choice(0, 1) would raise.
        return row

    # Work on a copy: pandas does not support functions passed to
    # DataFrame.apply mutating the object they receive.
    row = row.copy()

    n_columns_to_permute = int(
        np.random.beta(cols_a, cols_b) * (len(row) - 1)
    ) + 1
    # NOTE(review): np.random.choice samples with replacement by default, so
    # the same column can be drawn more than once and fewer than
    # n_columns_to_permute distinct columns may change -- confirm intended.
    cols = np.random.choice(len(row), n_columns_to_permute)

    for col in cols:
        if row.index[col] == 'id':
            # Never corrupt the identifier used to label duplicates.
            continue
        # ``col`` is a position, not a label, so use explicit positional
        # access instead of the deprecated integer fallback of ``row[col]``.
        val = row.iloc[col]
        for _ in range(np.random.poisson(permutes_lambda)):
            val = permute_string(val)
        row.iloc[col] = val
    return row
def main():
    """Generate fake people, add corrupted duplicates, and write three CSVs.

    Steps:
      1. Create ``n`` clean records with ``fake_record`` and give each an
         ``id`` equal to its DataFrame index.
      2. For each of ``n_passes`` passes, sample a
         ``percent_to_duplicate_on_pass`` fraction of the clean records, run
         every sampled row through ``permute_row``, mark it ``is_dupe=True``
         and restore its original ``id``.
      3. Write the clean records, the shuffled clean+duplicate records, and
         the same shuffled records without the ``id``/``is_dupe`` labels.
    """
    print('Generating the initial fake records')
    records = []
    for _ in tqdm(range(n)):
        records.append(fake_record())
    df = pd.DataFrame(records)
    df['id'] = df.index

    print('Duplication steps')
    duplicates = []
    for _ in tqdm(range(n_passes)):
        picked = df.sample(frac=percent_to_duplicate_on_pass)
        original_ids = picked['id']
        # permute_row corrupts a beta-distributed number of columns, each
        # with a Poisson-distributed number of random character replacements.
        corrupted = picked.apply(permute_row, axis=1)
        corrupted['is_dupe'] = True
        corrupted['id'] = original_ids  # restore ids if they were changed
        duplicates.append(corrupted)

    # Label the clean records only after sampling is finished.
    df['is_dupe'] = False

    # Stack clean rows and duplicates, then shuffle the combined rows.
    duped = pd.concat([df] + duplicates).sample(frac=1)

    args = (n, n_passes, percent_to_duplicate_on_pass)
    clean_path = '%s_people_clean_apply_%s_percent_%s.csv' % args
    duped_path = '%s_people_duped_apply_%s_percent_%s.csv' % args
    unlabeled_path = '%s_people_duped_unlabeled_apply_%s_percent_%s.csv' % args

    df.to_csv(clean_path, index=False)
    duped.to_csv(duped_path, index=False)
    unlabeled = duped.drop(['id', 'is_dupe'], axis=1)
    unlabeled.to_csv(unlabeled_path, index=False)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment