# Skip to content
#
# Instantly share code, notes, and snippets.
#
# Show Gist options
#   • Save abhijeet-talaulikar/60c612ffd381b559d046a57fe05ca829 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import csv
import random
import string
### Create full dataset of 5 million credit card customers ###
def generate_cust_id() -> str:
    """Return a random 7-character ID drawn from uppercase letters and digits.

    Characters are sampled with replacement, so IDs are not guaranteed to be
    unique across calls.
    """
    alphabet = string.ascii_uppercase + string.digits
    return ''.join(random.choices(alphabet, k=7))
def generate_age() -> int:
    """Return a random integer for the AGE column, from 25 to 35 inclusive."""
    return random.randint(25, 35)
def generate_num_credit_lines() -> int:
    """Return a random integer for NUM_CREDIT_LINES, from 1 to 3 inclusive."""
    return random.randint(1, 3)
def generate_income_tier() -> int:
    """Return a random integer for INCOME_TIER, from 1 to 3 inclusive."""
    return random.randint(1, 3)
def generate_spend_grocery() -> float:
    """Return a random grocery-spend weight in [0.1, 0.6], rounded to 2 decimals."""
    return round(random.uniform(0.1, 0.6), 2)
def generate_spend_gas() -> float:
    """Return a random gas-spend weight in [0.1, 0.6], rounded to 2 decimals."""
    return round(random.uniform(0.1, 0.6), 2)
def generate_spend_dineout() -> float:
    """Return a random dine-out-spend weight in [0.1, 0.6], rounded to 2 decimals."""
    return round(random.uniform(0.1, 0.6), 2)
def generate_spend_travel() -> float:
    """Return a random travel-spend weight in [0, 0.2], rounded to 2 decimals."""
    return round(random.uniform(0, 0.2), 2)
def generate_balance() -> float:
    """Return a random value for BALANCE in [500, 5000], rounded to 2 decimals."""
    return round(random.uniform(500, 5000), 2)
def generate_balance_frequency() -> float:
    """Return a random value for BALANCE_FREQUENCY in [0.2, 0.8], rounded to 2 decimals."""
    return round(random.uniform(0.2, 0.8), 2)
def generate_avg_trn_value() -> int:
    """Return a random integer for AVG_TRN_VALUE, from 10 to 30 inclusive."""
    return random.randint(10, 30)
def generate_purchases() -> float:
    """Return a random value for PURCHASES in [1, 100], rounded to 2 decimals."""
    return round(random.uniform(1, 100), 2)
def generate_oneoff_purchases() -> float:
    """Return a random value for ONEOFF_PURCHASES in [0, 1], rounded to 2 decimals."""
    return round(random.uniform(0, 1), 2)
def generate_installments_purchases(oneoff_purchases: float) -> float:
    """Return the complement of ``oneoff_purchases``, rounded to 2 decimals.

    Args:
        oneoff_purchases: The ONEOFF_PURCHASES value for the same customer.

    Returns:
        ``1 - oneoff_purchases`` rounded to two decimal places, so the two
        values sum to 1 (up to rounding).
    """
    return round(1 - oneoff_purchases, 2)
def generate_cash_advances() -> int:
    """Return a random integer for CASH_ADVANCES, from 0 to 5 inclusive."""
    return random.randint(0, 5)
def generate_forex_trn() -> int:
    """Return a random integer for FOREX_TRN, from 0 to 10 inclusive."""
    return random.randint(0, 10)
def calculate_credit_limit(balance: float, purchases: float, avg_trn_value: float) -> float:
    """Return the credit limit as balance plus purchases times average transaction value.

    Args:
        balance: The customer's BALANCE value.
        purchases: The customer's PURCHASES value.
        avg_trn_value: The customer's AVG_TRN_VALUE value.

    Returns:
        ``balance + purchases * avg_trn_value``.
    """
    return balance + purchases * avg_trn_value
def generate_payments() -> int:
    """Return a random integer for PAYMENTS, from 6 to 12 inclusive."""
    return random.randint(6, 12)
def generate_prcfullpayment() -> float:
    """Return a random value for PRCFULLPAYMENT in [0.5, 1], rounded to 2 decimals."""
    return round(random.uniform(0.5, 1), 2)
# Write the full synthetic population to CSV, one customer per row.
# newline='' lets the csv module control line endings itself.
with open('synthetic_credit_card_dataset.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow([
        'CUST_ID', 'AGE', 'NUM_CREDIT_LINES', 'INCOME_TIER',
        'SPEND_GROCERY', 'SPEND_GAS', 'SPEND_DINEOUT', 'SPEND_TRAVEL',
        'BALANCE', 'BALANCE_FREQUENCY', 'AVG_TRN_VALUE', 'PURCHASES',
        'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCES',
        'FOREX_TRN', 'CREDIT_LIMIT', 'PAYMENTS', 'PRCFULLPAYMENT', 'TENURE',
    ])
    for _ in range(5000000):
        # Values that feed more than one column are drawn once and reused.
        oneoff_purchases = generate_oneoff_purchases()
        balance = generate_balance()
        balance_frequency = generate_balance_frequency()
        avg_trn_value = generate_avg_trn_value()
        purchases = generate_purchases()

        spend_grocery = generate_spend_grocery()
        spend_gas = generate_spend_gas()
        spend_dineout = generate_spend_dineout()
        spend_travel = generate_spend_travel()

        # Normalise the four category weights into shares that sum to 1.
        # The total is computed once: dividing one weight at a time against a
        # freshly recomputed sum would change the denominator at every step.
        # The total is always >= 0.3 because three weights are at least 0.1.
        total_spend = spend_grocery + spend_gas + spend_dineout + spend_travel
        spend_grocery /= total_spend
        spend_gas /= total_spend
        spend_dineout /= total_spend
        spend_travel /= total_spend

        writer.writerow([
            generate_cust_id(),
            generate_age(),
            generate_num_credit_lines(),
            generate_income_tier(),
            spend_grocery,
            spend_gas,
            spend_dineout,
            spend_travel,
            balance,
            balance_frequency,
            avg_trn_value,
            purchases,
            oneoff_purchases,
            generate_installments_purchases(oneoff_purchases),
            generate_cash_advances(),
            generate_forex_trn(),
            # The limit is derived from the same PURCHASES value written in
            # this row, matching calculate_credit_limit's signature.
            calculate_credit_limit(balance, purchases, avg_trn_value),
            generate_payments(),
            generate_prcfullpayment(),
            10,  # TENURE is a constant for every customer
        ])
### Create pilot dataset using 1% of the customers ###
# Load the full synthetic population produced by the generation step.
data = pd.read_csv('synthetic_credit_card_dataset.csv')

# Draw a 1% sample of customers; random_state fixes which rows are picked.
pilot = data.sample(frac=0.01, random_state=1)

# Randomly assign each pilot customer to treated (1) or control (0).
# This flag is drawn exactly once: re-drawing it after Treatment has been
# derived would leave Treated disagreeing with the Treatment label and apply
# the wrong renewal probability to the mismatched rows.
pilot['Treated'] = np.random.choice([0,1], pilot.shape[0])

# Split PURCHASES * AVG_TRN_VALUE into three equal-width bins; the lowest bin
# is labelled with the 30% fee and the highest with the 70% fee.
pilot['Treatment'] = pd.cut(pilot['PURCHASES'] * pilot['AVG_TRN_VALUE'], 3, labels=["Annual Fee x 30%","Annual Fee x 50%","Annual Fee x 70%"])

# Control customers keep the full fee regardless of their bin.
pilot['Treatment'] = np.where(pilot['Treated'], pilot['Treatment'], "Full Annual Fee")

# Simulate the renewal outcome: treated customers renew with probability 0.7,
# control customers with probability 0.4.
# NOTE(review): all three discount tiers currently share the same 0.7
# probability; the nested structure is kept so each tier can be tuned.
pilot['Renewed'] = np.where(
    pilot['Treated'],
    np.where(pilot['Treatment']=="Annual Fee x 30%", np.random.binomial(1, 0.7, pilot.shape[0]),
             np.where(pilot['Treatment']=="Annual Fee x 50%", np.random.binomial(1, 0.7, pilot.shape[0]),
                      np.random.binomial(1, 0.7, pilot.shape[0]))),
    np.random.binomial(1, 0.4, pilot.shape[0])
)

pilot.to_csv('pilot_credit_card.csv', index=False)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment