paul-english · December 21, 2016 18:30
diff --git a/fake_people.py b/fake_people.py
 #!/usr/bin/env python

 import random
 import string

 import numpy as np
 import pandas as pd
 from faker import Faker
 from tqdm import tqdm

 fake = Faker()

 # --- Generation settings -------------------------------------------------
 n = 1000  # number of clean records generated by main()
 n_passes = 10  # number of duplication passes run by main()
 percent_to_duplicate_on_pass = 0.15  # passed as `frac` to df.sample on each pass

 # Shape parameters (a, b) of the beta distribution that permute_row uses to
 # decide how many columns of a row get permuted.
 cols_a, cols_b = 2, 6

 # Rate of the poisson distribution that permute_row uses to decide how many
 # times a single value is passed through permute_string.
 permutes_lambda = 3

 # TODO: consider drawing the parameters above from a distribution inside the
 # loops (driven by hyperparameters), e.g. sometimes a high lambda, sometimes a
 # different beta shape.

 def fake_record():
    """Build one synthetic person record.

    Each field is produced by the module-level ``fake`` generator; the
    ``date`` field holds the string form of ``fake.date_time()``.

    Returns:
        A dict keyed by field name (first_name, last_name, address, company,
        date, phone_number, job, ssn).
    """
    # Fields are filled one at a time, in the same order as the generator
    # calls, so both key order and call order are explicit.
    record = {}
    record['first_name'] = fake.first_name()
    record['last_name'] = fake.last_name()
    record['address'] = fake.address()
    record['company'] = fake.company()
    record['date'] = str(fake.date_time())
    record['phone_number'] = fake.phone_number()
    record['job'] = fake.job()
    record['ssn'] = fake.ssn()
    return record

 def permute_string(s):
    """Return a copy of ``s`` with one randomly chosen character replaced.

    A random position is overwritten with a random ASCII letter or digit, so
    the result has the same length as ``s`` and differs from it in at most
    one position.

    Args:
        s: the string to corrupt.

    Returns:
        The corrupted string; an empty ``s`` is returned unchanged.
    """
    if not s:
        # There is no index to replace in an empty string.
        return s
    idx = random.randrange(len(s))
    # string.ascii_letters exists on both Python 2 and 3, whereas
    # string.letters is Python 2 only.
    char = random.choice(string.ascii_letters + string.digits)
    return s[:idx] + char + s[idx + 1:]

 def permute_row(row):
    """Corrupt a random subset of the fields of one record.

    The number of fields to touch is ``int(beta(cols_a, cols_b) *
    (len(row) - 1)) + 1``, i.e. always at least one. Each chosen field is
    passed through ``permute_string`` a ``poisson(permutes_lambda)`` number
    of times. The field labelled ``'id'`` is never modified.

    Args:
        row: a pandas Series holding one record (as handed over by
            ``DataFrame.apply(..., axis=1)``); it is modified in place.

    Returns:
        The same ``row`` object after modification.
    """
    n_columns_to_permute = int(
        np.random.beta(cols_a, cols_b) * (len(row) - 1)
    ) + 1
    # np.random.choice draws with replacement by default, so a position can
    # come up more than once; that field is then simply corrupted again.
    positions = np.random.choice(len(row), n_columns_to_permute)

    for pos in positions:
        if row.index[pos] == 'id':
            continue
        # `pos` is a position, not a label: go through .iloc instead of
        # relying on the positional fallback of row[...] on a string index.
        val = row.iloc[pos]
        n_permutations = np.random.poisson(permutes_lambda)
        for _ in range(n_permutations):
            val = permute_string(val)
        row.iloc[pos] = val
    return row

 def main():
    """Generate the fake people dataset and write it out as three CSV files.

    Steps:
      1. Build ``n`` clean records with ``fake_record`` and give each an
         ``id`` equal to its DataFrame index.
      2. Run ``n_passes`` passes; each pass samples a fraction
         ``percent_to_duplicate_on_pass`` of the clean records, corrupts the
         sampled rows with ``permute_row`` and flags them ``is_dupe=True``.
      3. Concatenate clean and corrupted rows, reorder them with
         ``sample(frac=1)``, and write the clean, labelled and unlabelled
         frames to CSV files in the current directory.
    """
    print('Generating the initial fake records')
    records = []
    for _ in tqdm(range(n)):
        records.append(fake_record())
    df = pd.DataFrame(records)
    df['id'] = df.index

    print('Duplication steps')
    duplicates = []
    for _ in tqdm(range(n_passes)):
        picked = df.sample(frac=percent_to_duplicate_on_pass)
        original_ids = picked['id']

        # Each picked row gets a random number of its columns permuted (beta
        # distribution), each a poisson-distributed number of times.
        noisy = picked.apply(permute_row, axis=1)
        noisy['is_dupe'] = True
        noisy['id'] = original_ids  # restore ids if they were changed
        duplicates.append(noisy)

    # Flag the clean rows only after the passes, so the sampled rows handed
    # to permute_row do not carry an 'is_dupe' column.
    df['is_dupe'] = False

    duped = pd.concat([df] + duplicates).sample(frac=1)

    args = (n, n_passes, percent_to_duplicate_on_pass)
    outputs = [
        (df, '%s_people_clean_apply_%s_percent_%s.csv'),
        (duped, '%s_people_duped_apply_%s_percent_%s.csv'),
        (
            duped.drop(['id', 'is_dupe'], axis=1),
            '%s_people_duped_unlabeled_apply_%s_percent_%s.csv',
        ),
    ]
    for frame, template in outputs:
        frame.to_csv(template % args, index=False)

 # Run the generator only when executed as a script.
 if __name__ == '__main__':
    main()
	#!/usr/bin/env python

	import random
	import string

	import numpy as np
	import pandas as pd
	from faker import Faker
	from tqdm import tqdm

	fake = Faker()

	# --- Generation settings -------------------------------------------------
	n = 1000  # number of clean records generated by main()
	n_passes = 10  # number of duplication passes run by main()
	percent_to_duplicate_on_pass = 0.15  # passed as `frac` to df.sample on each pass

	# Shape parameters (a, b) of the beta distribution that permute_row uses to
	# decide how many columns of a row get permuted.
	cols_a, cols_b = 2, 6

	# Rate of the poisson distribution that permute_row uses to decide how many
	# times a single value is passed through permute_string.
	permutes_lambda = 3

	# TODO: consider drawing the parameters above from a distribution inside the
	# loops (driven by hyperparameters), e.g. sometimes a high lambda, sometimes a
	# different beta shape.

	def fake_record():
	    """Build one synthetic person record.

	    Each field is produced by the module-level ``fake`` generator; the
	    ``date`` field holds the string form of ``fake.date_time()``.

	    Returns:
	        A dict keyed by field name (first_name, last_name, address,
	        company, date, phone_number, job, ssn).
	    """
	    return {
	        'first_name': fake.first_name(),
	        'last_name': fake.last_name(),
	        'address': fake.address(),
	        'company': fake.company(),
	        'date': str(fake.date_time()),
	        'phone_number': fake.phone_number(),
	        'job': fake.job(),
	        'ssn': fake.ssn(),
	    }

	def permute_string(s):
	    """Return a copy of ``s`` with one randomly chosen character replaced.

	    A random position is overwritten with a random ASCII letter or digit,
	    so the result has the same length as ``s`` and differs from it in at
	    most one position.

	    Args:
	        s: the string to corrupt.

	    Returns:
	        The corrupted string; an empty ``s`` is returned unchanged.
	    """
	    if not s:
	        # There is no index to replace in an empty string.
	        return s
	    idx = random.randrange(len(s))
	    # string.ascii_letters exists on both Python 2 and 3, whereas
	    # string.letters is Python 2 only.
	    char = random.choice(string.ascii_letters + string.digits)
	    return s[:idx] + char + s[idx + 1:]

	def permute_row(row):
	    """Corrupt a random subset of the fields of one record.

	    The number of fields to touch is ``int(beta(cols_a, cols_b) *
	    (len(row) - 1)) + 1``, i.e. always at least one. Each chosen field is
	    passed through ``permute_string`` a ``poisson(permutes_lambda)`` number
	    of times. The field labelled ``'id'`` is never modified.

	    Args:
	        row: a pandas Series holding one record (as handed over by
	            ``DataFrame.apply(..., axis=1)``); it is modified in place.

	    Returns:
	        The same ``row`` object after modification.
	    """
	    n_columns_to_permute = int(
	        np.random.beta(cols_a, cols_b) * (len(row) - 1)
	    ) + 1
	    # np.random.choice draws with replacement by default, so a position can
	    # come up more than once; that field is then simply corrupted again.
	    positions = np.random.choice(len(row), n_columns_to_permute)

	    for pos in positions:
	        if row.index[pos] == 'id':
	            continue
	        # `pos` is a position, not a label: go through .iloc instead of
	        # relying on the positional fallback of row[...] on a string index.
	        val = row.iloc[pos]
	        n_permutations = np.random.poisson(permutes_lambda)
	        for _ in range(n_permutations):
	            val = permute_string(val)
	        row.iloc[pos] = val
	    return row

	def main():
	    """Generate the fake people dataset and write it out as three CSV files.

	    Steps:
	      1. Build ``n`` clean records with ``fake_record`` and give each an
	         ``id`` equal to its DataFrame index.
	      2. Run ``n_passes`` passes; each pass samples a fraction
	         ``percent_to_duplicate_on_pass`` of the clean records, corrupts
	         the sampled rows with ``permute_row`` and flags them
	         ``is_dupe=True``.
	      3. Concatenate clean and corrupted rows, reorder them with
	         ``sample(frac=1)``, and write the clean, labelled and unlabelled
	         frames to CSV files in the current directory.
	    """
	    print('Generating the initial fake records')
	    fakes = [fake_record() for _ in tqdm(range(n))]
	    df = pd.DataFrame(fakes)

	    duplicates = []
	    df['id'] = df.index

	    print('Duplication steps')
	    for _ in tqdm(range(n_passes)):
	        sample = df.sample(frac=percent_to_duplicate_on_pass)
	        ids = sample['id']

	        # Each sampled row gets a random number of its columns permuted
	        # (beta distribution), each a poisson-distributed number of times.
	        sample = sample.apply(permute_row, axis=1)
	        sample['is_dupe'] = True
	        sample['id'] = ids  # restore ids if they were changed
	        duplicates.append(sample)

	    # Flag the clean rows only after the passes, so the sampled rows handed
	    # to permute_row do not carry an 'is_dupe' column.
	    df['is_dupe'] = False

	    duped = pd.concat([df] + duplicates)
	    duped = duped.sample(frac=1)

	    args = (n, n_passes, percent_to_duplicate_on_pass)
	    df.to_csv('%s_people_clean_apply_%s_percent_%s.csv' % args, index=False)
	    duped.to_csv('%s_people_duped_apply_%s_percent_%s.csv' % args, index=False)
	    duped.drop(['id', 'is_dupe'], axis=1).to_csv(
	        '%s_people_duped_unlabeled_apply_%s_percent_%s.csv' % args,
	        index=False,
	    )

	# Run the generator only when executed as a script.
	if __name__ == '__main__':
	    main()