# Created May 9, 2024 09:47
# Save myui/2d5c9a4b81d9fed879f5e0ba5fcd3d8f to your computer and use it in GitHub Desktop.
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
import datetime
import random
import uuid

import pandas as pd
from faker import Faker
from tqdm import tqdm
def randint_gauss(start: int, end: int) -> int:
    """Draw one integer from ``[start, end)`` using a clamped normal distribution.

    The underlying normal distribution is centred on the midpoint of the
    range and uses one sixth of the range width as its standard deviation,
    so the bounds sit at roughly three standard deviations from the mean.
    Draws that still land outside the range are clamped to the nearest
    bound instead of being redrawn.

    Args:
        start: Lower bound of the range (inclusive).
        end: Upper bound of the range (exclusive).

    Returns:
        A single integer ``v`` with ``start <= v <= end - 1``. When the
        range is empty (``end <= start``) the clamp yields ``start``.
    """
    mean = (start + end) / 2
    # Range width / 6 puts start and end about 3 sigma away from the mean.
    std_dev = (end - start) / 6
    # int() truncates the float draw toward zero.
    value = int(random.gauss(mean, std_dev))
    # Clamp into [start, end - 1]; the outer max() wins when the range is empty.
    return max(start, min(value, end - 1))
def _upload_chunk(users, transactions, write_mode):
    """Bulk-import one buffered chunk of rows through the module-level ``client``.

    Args:
        users: Row tuples for the ``myui.master_profile`` table.
        transactions: Row tuples for the ``myui.transactions`` table.
        write_mode: Value passed as ``if_exists`` ('overwrite' or 'append').

    Empty buffers are skipped, so no upload is issued for them.
    """
    if users:
        master_df = pd.DataFrame(
            users,
            columns=['user_seq', 'user_id', 'name', 'sex', 'address', 'mail', 'birthdate', 'num_txn'],
        )
        client.load_table_from_dataframe(
            master_df, 'myui.master_profile',
            writer='bulk_import', if_exists=write_mode, fmt='msgpack',
        )
    if transactions:
        # Rows are shuffled and then ordered by timestamp, as in the original
        # pipeline, before the positional index is rebuilt.
        transaction_df = (
            pd.DataFrame(
                transactions,
                columns=['user_seq', 'user_id', 'tstamp', 'category', 'amount'],
            )
            .sample(frac=1)
            .sort_values(by=['tstamp'])
            .reset_index(drop=True)
        )
        client.load_table_from_dataframe(
            transaction_df, 'myui.transactions',
            writer='bulk_import', if_exists=write_mode, fmt='msgpack',
        )


def generate_fake_data(chunk_size: int = 500_000):
    """Generate fake users and their transactions and upload them in chunks.

    For each of ``NUM_USERS`` users a Faker profile is created together with
    a normally distributed number of transactions (``NUM_TXN_PER_USER``),
    each with a random category from ``PRODUCT_CATEGORIES`` and an amount
    drawn from ``AMOUNT_RANGE`` (both ends inclusive, per ``random.randint``).
    Rows are buffered in memory and flushed to ``myui.master_profile`` and
    ``myui.transactions`` whenever the user index is a positive multiple of
    ``chunk_size``; whatever remains is flushed after the loop.

    The first flush uses ``if_exists='overwrite'`` and every later flush uses
    ``'append'``. Because index 0 never triggers a flush, the first chunk
    holds ``chunk_size + 1`` users.

    Relies on the module-level names ``NUM_USERS``, ``NUM_TXN_PER_USER``,
    ``AMOUNT_RANGE``, ``PRODUCT_CATEGORIES`` and ``client`` being defined
    before the call.

    Args:
        chunk_size: Number of users between two in-loop flushes.

    Returns:
        A ``(users, transactions)`` tuple holding the rows of the final
        chunk, i.e. those buffered since the last in-loop flush (they have
        already been uploaded by the time the function returns).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    write_mode = 'overwrite'
    fake = Faker()

    users = []
    transactions = []
    for i in tqdm(range(NUM_USERS)):
        user_id = uuid.uuid4()
        # Read fields by key instead of relying on the dict's value order.
        profile = fake.simple_profile()
        birthdate = profile['birthdate'].isoformat()

        num_txn = randint_gauss(*NUM_TXN_PER_USER)
        for _ in range(num_txn):
            category = random.choice(PRODUCT_CATEGORIES)
            tstamp = fake.past_datetime(start_date='-9y')
            amount = random.randint(*AMOUNT_RANGE)
            transactions.append((i, user_id, tstamp, category, amount))

        users.append((
            i, user_id, profile['name'], profile['sex'], profile['address'],
            profile['mail'], birthdate, num_txn,
        ))

        if i > 0 and i % chunk_size == 0:
            _upload_chunk(users, transactions, write_mode)
            # Start fresh buffers; everything after the first flush appends.
            users = []
            transactions = []
            write_mode = 'append'

    # Flush the final, possibly partial, chunk.
    _upload_chunk(users, transactions, write_mode)
    return users, transactions
import os

# Export the API key and endpoints as environment variables before the
# pytd client is created; the client below is constructed without explicit
# credentials, so presumably it picks them up from here (see pytd docs).
# NOTE(review): td_apikey, td_endpoint and td_presto_endpoint are not defined
# anywhere in this file; they must exist (as strings) before this runs.
os.environ["TD_API_KEY"] = td_apikey
os.environ["TD_API_SERVER"] = td_endpoint
os.environ["TD_PRESTO_API"] = td_presto_endpoint

import pytd

# Shared client used by generate_fake_data() for all table uploads.
client = pytd.Client(database='myui', retry_post_requests=True)
# Number of fake users to generate. With NUM_TXN_PER_USER = (1, 200) each
# user gets about 100 transactions on average, so the transaction table ends
# up roughly 100x larger. Use a small value such as 10_000 for a trial run.
NUM_USERS = 200_000_000

# (start, end) passed to randint_gauss: start inclusive, end exclusive.
NUM_TXN_PER_USER = (1, 200)

# (low, high) passed to random.randint: both ends inclusive.
AMOUNT_RANGE = (1, 10000)

# Categories a transaction is drawn from, uniformly at random.
# NOTE(review): "Jewelery" looks like a misspelling, but it is emitted data;
# left unchanged so previously generated rows stay consistent.
PRODUCT_CATEGORIES = [
    "Books",
    "Movies",
    "Music",
    "Games",
    "Electronics",
    "Computers",
    "Home",
    "Garden",
    "Tools",
    "Grocery",
    "Health",
    "Beauty",
    "Toys",
    "Kids",
    "Baby",
    "Clothing",
    "Shoes",
    "Jewelery",
    "Sports",
    "Outdoors",
    "Automotive",
    "Industrial",
]
# Run the generator; the returned final-chunk buffers are not used here.
generate_fake_data()
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment