Skip to content

Instantly share code, notes, and snippets.

@drorata
Created June 26, 2025 09:09
Show Gist options
  • Save drorata/da8e11ac6bc07fa04f2fccdcc8d80881 to your computer and use it in GitHub Desktop.
Save drorata/da8e11ac6bc07fa04f2fccdcc8d80881 to your computer and use it in GitHub Desktop.
Compare Pandas and SQL code for the same purpose
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b9a3c479",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from datetime import datetime\n",
"import duckdb"
]
},
{
"cell_type": "markdown",
"id": "f6130ca1",
"metadata": {},
"source": [
"This notebook tests and compares two implementations for finding the latest valid assessment per reservation:\n",
"\n",
"1. **SQL Implementation**: Uses DuckDB with a CTE (Common Table Expression) to rank assessments and apply coalesce logic\n",
"2. **Pandas Implementation**: Uses DataFrame operations with merge, groupby, and backfill to achieve the same result\n",
"\n",
"## Test Logic\n",
"- For each reservation, find the **latest assessment** that occurs **before or at** the ActionTs timestamp\n",
"- If no valid assessment exists, use the **ActionTs** as the scoring timestamp (coalesce logic)\n",
"- Handle multiple assessments per reservation by selecting the most recent valid one\n",
"\n",
"## Test Cases\n",
"- Multiple assessments per reservation (latest valid selected)\n",
"- Single valid assessment\n",
"- Assessments after ActionTs (ignored)\n",
"- Assessment exactly at ActionTs (included)\n",
"- No assessments (ActionTs used as fallback)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "214d0912",
"metadata": {},
"outputs": [],
"source": [
"# Create test data directly as dataframes using arrays for better readability\n",
"\n",
"# Test contracts data\n",
"contracts = pd.DataFrame([\n",
" ['RES001', datetime(2024, 6, 10, 14, 30, 0)],\n",
" ['RES002', datetime(2024, 6, 12, 16, 45, 0)],\n",
" ['RES003', datetime(2024, 6, 13, 10, 15, 0)],\n",
" ['RES004', datetime(2024, 6, 14, 12, 0, 0)],\n",
" ['RES005', datetime(2024, 6, 15, 9, 30, 0)],\n",
" ['RES006', datetime(2024, 6, 16, 18, 0, 0)],\n",
" ['RES007', datetime(2024, 6, 17, 11, 30, 0)],\n",
" ['RES009', datetime(2024, 6, 19, 15, 45, 0)]\n",
"], columns=['ReservationNo', 'ActionTs'])\n",
"\n",
"# Test assessments data\n",
"assessments = pd.DataFrame([\n",
" # Multiple assessments for RES001\n",
" ['RES001', datetime(2024, 6, 8, 10, 0, 0)], # Valid early assessment\n",
" ['RES001', datetime(2024, 6, 9, 15, 30, 0)], # Valid later assessment\n",
" ['RES001', datetime(2024, 6, 10, 12, 0, 0)], # Valid latest assessment before ActionTs\n",
"\n",
" # Single assessment for RES002\n",
" ['RES002', datetime(2024, 6, 11, 8, 15, 0)], # Valid assessment\n",
"\n",
" # Assessment after ActionTs for RES003 (should be ignored)\n",
" ['RES003', datetime(2024, 6, 14, 18, 0, 0)], # Invalid - after ActionTs\n",
"\n",
" # Assessment exactly at ActionTs for RES004\n",
" ['RES004', datetime(2024, 6, 14, 12, 0, 0)], # Exactly at ActionTs time\n",
"\n",
" # Multiple assessments for RES006, one after ActionTs\n",
" ['RES006', datetime(2024, 6, 15, 9, 0, 0)], # Valid assessment\n",
" ['RES006', datetime(2024, 6, 16, 10, 30, 0)], # Valid latest assessment before ActionTs\n",
" ['RES006', datetime(2024, 6, 17, 20, 0, 0)], # Invalid - after ActionTs\n",
"\n",
" # Multiple assessments for RES009\n",
" ['RES009', datetime(2024, 6, 19, 15, 0, 0)], # Valid assessment\n",
" ['RES009', datetime(2024, 6, 19, 16, 0, 0)] # Invalid - after ActionTs\n",
"\n",
" # No assessments for RES005, RES007 - will test coalesce logic\n",
"], columns=['reservation_number', 'assessment_ts'])\n",
"\n",
"# Expected results\n",
"expected_results = pd.DataFrame([\n",
" ['RES001', datetime(2024, 6, 10, 14, 30, 0), datetime(2024, 6, 10, 12, 0, 0)], # Latest valid assessment\n",
" ['RES002', datetime(2024, 6, 12, 16, 45, 0), datetime(2024, 6, 11, 8, 15, 0)], # Single valid assessment\n",
" ['RES003', datetime(2024, 6, 13, 10, 15, 0), datetime(2024, 6, 13, 10, 15, 0)], # ActionTs (assessment ignored)\n",
" ['RES004', datetime(2024, 6, 14, 12, 0, 0), datetime(2024, 6, 14, 12, 0, 0)], # Assessment at exact ActionTs\n",
" ['RES005', datetime(2024, 6, 15, 9, 30, 0), datetime(2024, 6, 15, 9, 30, 0)], # ActionTs (no assessments)\n",
" ['RES006', datetime(2024, 6, 16, 18, 0, 0), datetime(2024, 6, 16, 10, 30, 0)], # Latest valid assessment\n",
" ['RES007', datetime(2024, 6, 17, 11, 30, 0), datetime(2024, 6, 17, 11, 30, 0)], # ActionTs (no assessments)\n",
" ['RES009', datetime(2024, 6, 19, 15, 45, 0), datetime(2024, 6, 19, 15, 0, 0)] # Valid assessment\n",
"], columns=['ReservationNo', 'ActionTs', 'scoring_ts'])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0ccb7a10",
"metadata": {},
"outputs": [],
"source": [
"# Test the SQL query with the test data\n",
"result = duckdb.sql(\"\"\"\n",
"WITH RankedAssessments AS (\n",
" SELECT\n",
" ass.reservation_number,\n",
" ass.assessment_ts,\n",
" ROW_NUMBER() OVER (PARTITION BY ass.reservation_number ORDER BY ass.assessment_ts DESC) as rn\n",
" FROM\n",
" assessments as ass\n",
" INNER JOIN\n",
" contracts as cont ON ass.reservation_number = cont.ReservationNo\n",
" WHERE\n",
" ass.assessment_ts <= cont.ActionTs\n",
")\n",
"SELECT\n",
" t.ReservationNo,\n",
" t.ActionTs,\n",
" coalesce(ra.assessment_ts, t.ActionTs) AS scoring_ts\n",
"FROM\n",
" contracts t\n",
"LEFT JOIN\n",
" RankedAssessments ra ON t.ReservationNo = ra.reservation_number AND ra.rn = 1\n",
"order by t.ReservationNo\n",
"\"\"\").to_df()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "65324a19",
"metadata": {},
"outputs": [],
"source": [
"# Pandas-based implementation of the same logic (adapted from scratch.ipynb)\n",
"pandas_result = contracts.merge(\n",
" pd.DataFrame(\n",
" assessments.merge(\n",
" contracts[[\"ActionTs\", \"ReservationNo\"]],\n",
" left_on=\"reservation_number\",\n",
" right_on=\"ReservationNo\",\n",
" how=\"left\",\n",
" )\n",
" .sort_values([\"reservation_number\", \"assessment_ts\"])\n",
" .drop(columns=[\"ReservationNo\"])\n",
" # Remove assessments that have no corresponding checkout timestamp\n",
" # that occur due to the merge\n",
" .dropna(subset=[\"ActionTs\"])\n",
" # Keep assessments that occur before or at the checkout timestamp\n",
" .loc[lambda df: df[\"assessment_ts\"] <= df[\"ActionTs\"]]\n",
" .groupby(\"reservation_number\")[\"assessment_ts\"]\n",
" .max()\n",
" ).reset_index(),\n",
" left_on=\"ReservationNo\",\n",
" right_on=\"reservation_number\",\n",
" how=\"left\",\n",
").drop(\"reservation_number\", axis=1)\n",
"\n",
"pandas_result[\"scoring_ts\"] = (\n",
" pandas_result[[\"assessment_ts\", \"ActionTs\"]].bfill(axis=1).iloc[:, 0]\n",
")\n",
"\n",
"pandas_result = pandas_result[[\"ReservationNo\", \"ActionTs\", \"scoring_ts\"]].sort_values(\"ReservationNo\").reset_index(drop=True).copy()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e3b864a5",
"metadata": {},
"outputs": [],
"source": [
"pd.testing.assert_frame_equal(pandas_result, expected_results)\n",
"pd.testing.assert_frame_equal(result, expected_results)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "mbam-scorecard-data-provisioning-kbTQ1mz3-py3.12",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment