@valarpirai
Last active June 16, 2025 17:34
MySQL 10 million transactions table exercise
# Bulk-insert 10 million synthetic transactions into MySQL in 10K-row batches.
import random
from datetime import datetime, timedelta

import tqdm
from sqlalchemy import create_engine, Table, Column, BigInteger, DateTime, MetaData
from sqlalchemy.dialects.mysql import DECIMAL

# Database connection (replace with your credentials)
DB_URL = "mysql+pymysql://root:root@localhost:3306/rambo"
engine = create_engine(DB_URL)

# Define table metadata
metadata = MetaData()
transactions = Table(
    'transactions', metadata,
    Column('transaction_id', BigInteger, primary_key=True),
    Column('user_id', BigInteger, nullable=False),
    Column('transaction_timestamp', DateTime, nullable=False),
    Column('amount', DECIMAL(10, 2), nullable=False)
)

# Parameters
NUM_ROWS = 10_000_000
NUM_USERS = 10_000
BATCH_SIZE = 10_000
START_DATE = datetime(2024, 6, 16)
END_DATE = datetime(2025, 6, 16)

def generate_transaction(i):
    # Build one synthetic row: random user, random timestamp in range, random amount.
    user_id = random.randint(1, NUM_USERS)
    delta = END_DATE - START_DATE
    random_seconds = random.randint(0, int(delta.total_seconds()))
    timestamp = START_DATE + timedelta(seconds=random_seconds)
    amount = round(random.uniform(1.0, 1000.0), 2)
    return {
        'transaction_id': i + 1,
        'user_id': user_id,
        'transaction_timestamp': timestamp,
        'amount': amount
    }

def insert_data():
    try:
        with engine.connect() as connection:
            print(f"Inserting {NUM_ROWS:,} rows in batches of {BATCH_SIZE:,}...")
            for start in tqdm.tqdm(range(0, NUM_ROWS, BATCH_SIZE), desc="Progress"):
                batch = [generate_transaction(i) for i in range(start, min(start + BATCH_SIZE, NUM_ROWS))]
                # Passing a list of dicts triggers an executemany-style batched INSERT
                connection.execute(transactions.insert(), batch)
                connection.commit()  # commit per batch to keep transactions small
        print("Data insertion completed successfully!")
    except Exception as e:
        print(f"Error occurred: {e}")

if __name__ == "__main__":
    insert_data()
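Batched INSERTs through SQLAlchemy often sustain only tens of thousands of rows per second, so the load above can take several minutes. If that is too slow, a common alternative is to dump the generated rows to CSV and bulk-load them. A minimal sketch, assuming the rows were written to /tmp/transactions.csv (a hypothetical path) and that local_infile is enabled on both server and client:

# Bulk load usually beats batched INSERTs by a wide margin.
# Assumes: SET GLOBAL local_infile = 1; and a client started with --local-infile.
LOAD DATA LOCAL INFILE '/tmp/transactions.csv'
INTO TABLE transactions
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
(transaction_id, user_id, transaction_timestamp, amount);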
# Table schema: the composite index supports per-user, time-ordered lookups
CREATE TABLE transactions (
    transaction_id BIGINT PRIMARY KEY,
    user_id BIGINT NOT NULL,
    transaction_timestamp DATETIME NOT NULL,
    amount DECIMAL(10, 2) NOT NULL,
    INDEX idx_user_timestamp (user_id, transaction_timestamp)
);
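The composite index (user_id, transaction_timestamp) is what makes per-user time queries cheap: MySQL can seek to one user's entries and read them already ordered by timestamp, with no sort. A quick sanity check, where user 42 is an arbitrary example id:

# Verify the index is used: expect ref access on idx_user_timestamp
# with "Backward index scan" and no "Using filesort" in the Extra column.
EXPLAIN
SELECT *
FROM transactions
WHERE user_id = 42
ORDER BY transaction_timestamp DESC
LIMIT 1;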
# Get the latest transaction per user from the 10M-row table
WITH ranked AS (
    SELECT t.*,
           ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY transaction_timestamp DESC) AS rn
    FROM transactions t
)
SELECT *
FROM ranked
WHERE rn = 1;
# Now get the 2nd latest per user.
WITH ranked AS (
    SELECT t.*,
           ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY transaction_timestamp DESC) AS rn
    FROM transactions t
)
SELECT *
FROM ranked
WHERE rn = 2;
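Both window-function queries above scan and sort all 10 million rows even though at most one row per user survives. For a single known user, the composite index answers either rank directly (user 42 is again an arbitrary example):

# 2nd-latest transaction for one user: a short backward walk of idx_user_timestamp.
SELECT *
FROM transactions
WHERE user_id = 42
ORDER BY transaction_timestamp DESC
LIMIT 1 OFFSET 1;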
# Now do it efficiently: avoid subqueries if you can.
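One way to answer this: with only ~10K users, driving one index seek per user is far cheaper than sorting 10M rows. A sketch using a LATERAL derived table (MySQL 8.0.14+); it still relies on derived tables, but the cost profile shifts from one 10M-row window sort to roughly 10K index probes:

# Latest transaction per user: one backward index seek per user via idx_user_timestamp.
SELECT latest.*
FROM (SELECT DISTINCT user_id FROM transactions) AS u,
LATERAL (
    SELECT t.*
    FROM transactions t
    WHERE t.user_id = u.user_id
    ORDER BY t.transaction_timestamp DESC
    LIMIT 1          -- LIMIT 1 OFFSET 1 gives the 2nd latest instead
) AS latest;

The DISTINCT user list itself can be satisfied from the index. If even this is too slow, or the question is asked constantly, the usual escape is a small latest-transaction summary table maintained on write, which needs no subquery at all.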
# What’s your strategy for NULLs in joins and filters?
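On the NULL question: every column in transactions is NOT NULL, so the traps show up when joining to other tables. A sketch of the standard rules; the users table and its nullable referrer_id below are hypothetical, purely for illustration:

# '= NULL' is never true; NULL checks need IS NULL (MySQL also has the
# null-safe operator <=>, which treats NULL <=> NULL as true).
SELECT * FROM users WHERE referrer_id = NULL;   -- always returns zero rows
SELECT * FROM users WHERE referrer_id IS NULL;  -- correct

# In a LEFT JOIN, filters on the right-hand table belong in ON, not WHERE;
# in WHERE they reject the NULL-extended rows and quietly turn it into an INNER JOIN.
SELECT u.user_id, t.amount
FROM users u
LEFT JOIN transactions t
       ON t.user_id = u.user_id
      AND t.amount > 100;

# NOT IN against a subquery that can yield NULL matches nothing; prefer NOT EXISTS.
SELECT u.user_id
FROM users u
WHERE NOT EXISTS (SELECT 1 FROM transactions t WHERE t.user_id = u.user_id);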