Skip to content

Instantly share code, notes, and snippets.

@paul-english
Created December 21, 2016 18:30
Show Gist options
  • Save paul-english/061f3984f598522ca27168227aa14820 to your computer and use it in GitHub Desktop.
Save paul-english/061f3984f598522ca27168227aa14820 to your computer and use it in GitHub Desktop.
fake_people.py
#!/usr/bin/env python
import random
import string
import numpy as np
import pandas as pd
from faker import Faker
from tqdm import tqdm
# Shared Faker instance; fake_record() draws every field from it.
fake = Faker()
# Settings
n = int(1e3) # number of clean records main() generates (because we do big data, right?)
n_passes = 10 # number of duplication passes main() runs over the clean records
percent_to_duplicate_on_pass = 0.15 # passed to DataFrame.sample(frac=...), so a fraction (0.15 == 15%)
# Params (a, b) for the beta distribution that decides how many columns
# permute_row() corrupts in a given row.
cols_a = 2
cols_b = 6
# Rate of the Poisson distribution that decides how many single-character
# replacements permute_row() applies to each chosen column.
permutes_lambda = 3
# TODO: consider drawing the parameters above from a distribution inside the
# loops (driven by hyperparameters), e.g. sometimes lambda should be high and
# sometimes the beta parameters should differ.
def fake_record():
    """Build one synthetic person record.

    Returns:
        dict: one entry per field, filled from the module-level ``fake``
        generator. The ``date`` field is the ``str()`` of the generated
        datetime; every other value is stored as the provider returns it.
    """
    # Fields are filled one at a time, in a fixed order, so the sequence of
    # calls made to the generator stays the same from record to record.
    record = {}
    record['first_name'] = fake.first_name()
    record['last_name'] = fake.last_name()
    record['address'] = fake.address()
    record['company'] = fake.company()
    record['date'] = str(fake.date_time())
    record['phone_number'] = fake.phone_number()
    record['job'] = fake.job()
    record['ssn'] = fake.ssn()
    return record
def permute_string(s):
    """Return a copy of ``s`` with one randomly chosen character replaced.

    The replacement is drawn uniformly from the ASCII letters and digits, so
    it may happen to equal the character it replaces.

    Args:
        s: the string to corrupt.

    Returns:
        A string of the same length as ``s`` that differs from it in at most
        one position. An empty ``s`` is returned unchanged.
    """
    if not s:
        # There is no index to replace; random.randint(0, -1) would raise.
        return s
    idx = random.randrange(len(s))
    # string.ascii_letters exists on both Python 2 and 3; string.letters was
    # removed in Python 3.
    char = random.choice(string.ascii_letters + string.digits)
    return s[:idx] + char + s[idx + 1:]
def permute_row(row):
    """Corrupt a random subset of a record's fields and return the record.

    The number of column positions drawn is ``int(beta * (len(row) - 1)) + 1``
    with ``beta ~ Beta(cols_a, cols_b)``, so at least one position is always
    drawn. Each chosen field then receives ``Poisson(permutes_lambda)``
    single-character replacements via ``permute_string`` (possibly zero).

    Args:
        row: a pandas Series holding one record; it is modified in place.
            Chosen values are handed to ``permute_string`` as-is.

    Returns:
        The same ``row`` object, after modification.
    """
    n_columns_to_permute = int(
        np.random.beta(cols_a, cols_b) * (len(row) - 1)
    ) + 1
    # Positions are drawn with replacement (np.random.choice default), so the
    # same column can be picked, and therefore corrupted, more than once.
    cols = np.random.choice(len(row), n_columns_to_permute)
    for col in cols:
        # The 'id' column is never corrupted.
        if row.index[col] == 'id':
            continue
        # `col` is a position, not a label: use .iloc rather than relying on
        # the deprecated integer fallback of Series.__getitem__.
        val = row.iloc[col]
        n_permutations = np.random.poisson(permutes_lambda)
        for _ in range(n_permutations):
            val = permute_string(val)
        row.iloc[col] = val
    return row
def main():
    """Generate fake records, add noisy duplicates, and write three CSVs.

    Steps:
      1. Build ``n`` records with ``fake_record`` and give each an ``id``
         equal to its DataFrame index.
      2. For each of ``n_passes`` passes, sample a fraction
         ``percent_to_duplicate_on_pass`` of the clean records, corrupt each
         sampled row with ``permute_row``, and flag it ``is_dupe = True``.
      3. Write the clean frame, the shuffled clean+duplicate frame, and that
         same shuffled frame without the ``id`` / ``is_dupe`` columns.

    Output files are written relative to the current working directory.
    """
    print('Generating the initial fake records')
    records = []
    for _ in tqdm(list(range(n))):
        records.append(fake_record())
    clean = pd.DataFrame(records)
    clean['id'] = clean.index

    print('Duplication steps')
    noisy_copies = []
    for _ in tqdm(list(range(n_passes))):
        picked = clean.sample(frac=percent_to_duplicate_on_pass)
        original_ids = picked['id']
        # permute_row corrupts a beta-distributed number of columns, each
        # with a Poisson-distributed number of character replacements.
        mangled = picked.apply(permute_row, axis=1)
        mangled['is_dupe'] = True
        # Put the ids back in case the row-wise apply altered them.
        mangled['id'] = original_ids
        noisy_copies.append(mangled)

    # The flag is added only after sampling, so it is set here for the clean
    # rows and inside the loop for the duplicates.
    clean['is_dupe'] = False
    combined = pd.concat([clean] + noisy_copies)
    shuffled = combined.sample(frac=1)

    name_parts = (n, n_passes, percent_to_duplicate_on_pass)
    clean_path = '%s_people_clean_apply_%s_percent_%s.csv' % name_parts
    duped_path = '%s_people_duped_apply_%s_percent_%s.csv' % name_parts
    unlabeled_path = '%s_people_duped_unlabeled_apply_%s_percent_%s.csv' % name_parts

    clean.to_csv(clean_path, index=False)
    shuffled.to_csv(duped_path, index=False)
    unlabeled = shuffled.drop(['id', 'is_dupe'], axis=1)
    unlabeled.to_csv(unlabeled_path, index=False)
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment