Created
June 26, 2025 09:09
-
-
Save drorata/da8e11ac6bc07fa04f2fccdcc8d80881 to your computer and use it in GitHub Desktop.
Compare Pandas and SQL code for the same purpose
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "b9a3c479", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"from datetime import datetime\n", | |
"import duckdb" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f6130ca1", | |
"metadata": {}, | |
"source": [ | |
"This notebook tests and compares two implementations for finding the latest valid assessment per reservation:\n", | |
"\n", | |
"1. **SQL Implementation**: Uses DuckDB with a CTE (Common Table Expression) to rank assessments and apply coalesce logic\n", | |
"2. **Pandas Implementation**: Uses DataFrame operations with merge, groupby, and backfill to achieve the same result\n", | |
"\n", | |
"## Test Logic\n", | |
"- For each reservation, find the **latest assessment** that occurs **before or at** the ActionTs timestamp\n", | |
"- If no valid assessment exists, use the **ActionTs** as the scoring timestamp (coalesce logic)\n", | |
"- Handle multiple assessments per reservation by selecting the most recent valid one\n", | |
"\n", | |
"## Test Cases\n", | |
"- Multiple assessments per reservation (latest valid selected)\n", | |
"- Single valid assessment\n", | |
"- Assessments after ActionTs (ignored)\n", | |
"- Assessment exactly at ActionTs (included)\n", | |
"- No assessments (ActionTs used as fallback)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "214d0912", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create test data directly as dataframes using arrays for better readability\n", | |
"\n", | |
"# Test contracts data\n", | |
"contracts = pd.DataFrame([\n", | |
" ['RES001', datetime(2024, 6, 10, 14, 30, 0)],\n", | |
" ['RES002', datetime(2024, 6, 12, 16, 45, 0)],\n", | |
" ['RES003', datetime(2024, 6, 13, 10, 15, 0)],\n", | |
" ['RES004', datetime(2024, 6, 14, 12, 0, 0)],\n", | |
" ['RES005', datetime(2024, 6, 15, 9, 30, 0)],\n", | |
" ['RES006', datetime(2024, 6, 16, 18, 0, 0)],\n", | |
" ['RES007', datetime(2024, 6, 17, 11, 30, 0)],\n", | |
" ['RES009', datetime(2024, 6, 19, 15, 45, 0)]\n", | |
"], columns=['ReservationNo', 'ActionTs'])\n", | |
"\n", | |
"# Test assessments data\n", | |
"assessments = pd.DataFrame([\n", | |
" # Multiple assessments for RES001\n", | |
" ['RES001', datetime(2024, 6, 8, 10, 0, 0)], # Valid early assessment\n", | |
" ['RES001', datetime(2024, 6, 9, 15, 30, 0)], # Valid later assessment\n", | |
" ['RES001', datetime(2024, 6, 10, 12, 0, 0)], # Valid latest assessment before ActionTs\n", | |
"\n", | |
" # Single assessment for RES002\n", | |
" ['RES002', datetime(2024, 6, 11, 8, 15, 0)], # Valid assessment\n", | |
"\n", | |
" # Assessment after ActionTs for RES003 (should be ignored)\n", | |
" ['RES003', datetime(2024, 6, 14, 18, 0, 0)], # Invalid - after ActionTs\n", | |
"\n", | |
" # Assessment exactly at ActionTs for RES004\n", | |
" ['RES004', datetime(2024, 6, 14, 12, 0, 0)], # Exactly at ActionTs time\n", | |
"\n", | |
" # Multiple assessments for RES006, one after ActionTs\n", | |
" ['RES006', datetime(2024, 6, 15, 9, 0, 0)], # Valid assessment\n", | |
" ['RES006', datetime(2024, 6, 16, 10, 30, 0)], # Valid latest assessment before ActionTs\n", | |
" ['RES006', datetime(2024, 6, 17, 20, 0, 0)], # Invalid - after ActionTs\n", | |
"\n", | |
" # Multiple assessments for RES009\n", | |
" ['RES009', datetime(2024, 6, 19, 15, 0, 0)], # Valid assessment\n", | |
" ['RES009', datetime(2024, 6, 19, 16, 0, 0)] # Invalid - after ActionTs\n", | |
"\n", | |
" # No assessments for RES005, RES007 - will test coalesce logic\n", | |
"], columns=['reservation_number', 'assessment_ts'])\n", | |
"\n", | |
"# Expected results\n", | |
"expected_results = pd.DataFrame([\n", | |
" ['RES001', datetime(2024, 6, 10, 14, 30, 0), datetime(2024, 6, 10, 12, 0, 0)], # Latest valid assessment\n", | |
" ['RES002', datetime(2024, 6, 12, 16, 45, 0), datetime(2024, 6, 11, 8, 15, 0)], # Single valid assessment\n", | |
" ['RES003', datetime(2024, 6, 13, 10, 15, 0), datetime(2024, 6, 13, 10, 15, 0)], # ActionTs (assessment ignored)\n", | |
" ['RES004', datetime(2024, 6, 14, 12, 0, 0), datetime(2024, 6, 14, 12, 0, 0)], # Assessment at exact ActionTs\n", | |
" ['RES005', datetime(2024, 6, 15, 9, 30, 0), datetime(2024, 6, 15, 9, 30, 0)], # ActionTs (no assessments)\n", | |
" ['RES006', datetime(2024, 6, 16, 18, 0, 0), datetime(2024, 6, 16, 10, 30, 0)], # Latest valid assessment\n", | |
" ['RES007', datetime(2024, 6, 17, 11, 30, 0), datetime(2024, 6, 17, 11, 30, 0)], # ActionTs (no assessments)\n", | |
" ['RES009', datetime(2024, 6, 19, 15, 45, 0), datetime(2024, 6, 19, 15, 0, 0)] # Valid assessment\n", | |
"], columns=['ReservationNo', 'ActionTs', 'scoring_ts'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "0ccb7a10", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Test the SQL query with the test data\n", | |
"result = duckdb.sql(\"\"\"\n", | |
"WITH RankedAssessments AS (\n", | |
" SELECT\n", | |
" ass.reservation_number,\n", | |
" ass.assessment_ts,\n", | |
" ROW_NUMBER() OVER (PARTITION BY ass.reservation_number ORDER BY ass.assessment_ts DESC) as rn\n", | |
" FROM\n", | |
" assessments as ass\n", | |
" INNER JOIN\n", | |
" contracts as cont ON ass.reservation_number = cont.ReservationNo\n", | |
" WHERE\n", | |
" ass.assessment_ts <= cont.ActionTs\n", | |
")\n", | |
"SELECT\n", | |
" t.ReservationNo,\n", | |
" t.ActionTs,\n", | |
" coalesce(ra.assessment_ts, t.ActionTs) AS scoring_ts\n", | |
"FROM\n", | |
" contracts t\n", | |
"LEFT JOIN\n", | |
" RankedAssessments ra ON t.ReservationNo = ra.reservation_number AND ra.rn = 1\n", | |
"order by t.ReservationNo\n", | |
"\"\"\").to_df()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "65324a19", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Pandas-based implementation of the same logic (adapted from scratch.ipynb)\n", | |
"pandas_result = contracts.merge(\n", | |
" pd.DataFrame(\n", | |
" assessments.merge(\n", | |
" contracts[[\"ActionTs\", \"ReservationNo\"]],\n", | |
" left_on=\"reservation_number\",\n", | |
" right_on=\"ReservationNo\",\n", | |
" how=\"left\",\n", | |
" )\n", | |
" .sort_values([\"reservation_number\", \"assessment_ts\"])\n", | |
" .drop(columns=[\"ReservationNo\"])\n", | |
" # Remove assessments that have no corresponding checkout timestamp\n", | |
" # that occur due to the merge\n", | |
" .dropna(subset=[\"ActionTs\"])\n", | |
" # Keep assessments that occur before or at the checkout timestamp\n", | |
" .loc[lambda df: df[\"assessment_ts\"] <= df[\"ActionTs\"]]\n", | |
" .groupby(\"reservation_number\")[\"assessment_ts\"]\n", | |
" .max()\n", | |
" ).reset_index(),\n", | |
" left_on=\"ReservationNo\",\n", | |
" right_on=\"reservation_number\",\n", | |
" how=\"left\",\n", | |
").drop(\"reservation_number\", axis=1)\n", | |
"\n", | |
"pandas_result[\"scoring_ts\"] = (\n", | |
" pandas_result[[\"assessment_ts\", \"ActionTs\"]].bfill(axis=1).iloc[:, 0]\n", | |
")\n", | |
"\n", | |
"pandas_result = pandas_result[[\"ReservationNo\", \"ActionTs\", \"scoring_ts\"]].sort_values(\"ReservationNo\").reset_index(drop=True).copy()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "e3b864a5", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pd.testing.assert_frame_equal(pandas_result, expected_results)\n", | |
"pd.testing.assert_frame_equal(result, expected_results)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "mbam-scorecard-data-provisioning-kbTQ1mz3-py3.12", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.12.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment