Last active
June 16, 2025 17:34
-
-
Save valarpirai/b6b3c63043a320472f39bd310b4f39a7 to your computer and use it in GitHub Desktop.
Msyql table 10 Million transactions exercise
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
from datetime import datetime, timedelta | |
from sqlalchemy import create_engine, Table, Column, BigInteger, DateTime, Numeric, MetaData | |
from sqlalchemy.dialects.mysql import DECIMAL | |
import tqdm | |
# Database connection (replace with your credentials) | |
DB_URL = "mysql+pymysql://root:root@localhost:3306/rambo" | |
engine = create_engine(DB_URL) | |
# Define table metadata | |
metadata = MetaData() | |
transactions = Table( | |
'transactions', metadata, | |
Column('transaction_id', BigInteger, primary_key=True), | |
Column('user_id', BigInteger, nullable=False), | |
Column('transaction_timestamp', DateTime, nullable=False), | |
Column('amount', DECIMAL(10, 2), nullable=False) | |
) | |
# Parameters | |
NUM_ROWS = 10_000_000 | |
NUM_USERS = 10_000 | |
BATCH_SIZE = 10_000 | |
START_DATE = datetime(2024, 6, 16) | |
END_DATE = datetime(2025, 6, 16) | |
def generate_transaction(i): | |
user_id = random.randint(1, NUM_USERS) | |
delta = END_DATE - START_DATE | |
random_seconds = random.randint(0, int(delta.total_seconds())) | |
timestamp = START_DATE + timedelta(seconds=random_seconds) | |
amount = round(random.uniform(1.0, 1000.0), 2) | |
return { | |
'transaction_id': i + 1, | |
'user_id': user_id, | |
'transaction_timestamp': timestamp, | |
'amount': amount | |
} | |
def insert_data(): | |
try: | |
with engine.connect() as connection: | |
print(f"Inserting {NUM_ROWS:,} rows in batches of {BATCH_SIZE:,}...") | |
for start in tqdm.tqdm(range(0, NUM_ROWS, BATCH_SIZE), desc="Progress"): | |
batch = [generate_transaction(i) for i in range(start, min(start + BATCH_SIZE, NUM_ROWS))] | |
connection.execute(transactions.insert(), batch) | |
connection.commit() | |
print("Data insertion completed successfully!") | |
except Exception as e: | |
print(f"Error occurred: {e}") | |
if __name__ == "__main__": | |
insert_data() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE TABLE transactions ( | |
transaction_id BIGINT PRIMARY KEY, | |
user_id BIGINT NOT NULL, | |
transaction_timestamp DATETIME NOT NULL, | |
amount DECIMAL(10, 2) NOT NULL, | |
INDEX idx_user_timestamp (user_id, transaction_timestamp) | |
); | |
# Get the latest transaction per user from a 10M row table | |
with ranked as ( | |
SELECT t.*, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY transaction_timestamp DESC) as rn | |
FROM transactions t | |
) | |
SELECT * from ranked | |
WHERE rn = 1; | |
# Now get the 2nd latest per user. | |
with ranked as ( | |
SELECT t.*, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY transaction_timestamp DESC) as rn | |
FROM transactions t | |
) | |
SELECT * from ranked | |
WHERE rn = 2; | |
# Now do it efficiently — avoid subqueries if you can. | |
# What’s your strategy for NULLs in joins and filters? |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment