Skip to content

Instantly share code, notes, and snippets.

@dr-dror
Created September 27, 2024 15:06
Show Gist options
  • Save dr-dror/0f59e31e4d7d7dbd8d191ba816bd67dd to your computer and use it in GitHub Desktop.
Save dr-dror/0f59e31e4d7d7dbd8d191ba816bd67dd to your computer and use it in GitHub Desktop.
Pivot a dataframe directly using Spark SQL
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession, DataFrame\n",
"\n",
"spark = (\n",
" SparkSession.builder\n",
" .getOrCreate()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def display_df(df: DataFrame, n_rows: int = 5):\n",
" pandas_df = df.limit(n_rows).toPandas()\n",
" return pandas_df"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.parquet(\"./in_Progress/Pivot using Spark SQL/data.parquet\")\n",
"df.createOrReplaceTempView(\"data\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"display_df(df)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"with open(\"./in_Progress/Pivot using Spark SQL/query.sql\", \"r\") as f:\n",
" query = f.read()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"count_df = spark.sql(query.format(agg_func=\"count\"))\n",
"count_df = count_df.toPandas().set_index(\"passenger_count\")\n",
"count_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sum_df = spark.sql(query.format(agg_func=\"sum\"))\n",
"sum_df = sum_df.toPandas().set_index(\"passenger_count\")\n",
"sum_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sum_df / count_df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
SELECT
*
FROM
(
SELECT
date_format (tpep_pickup_datetime, 'EEEE') AS day_of_week,
fare_amount,
passenger_count
FROM
data
) PIVOT (
{agg_func}(fare_amount) FOR day_of_week IN (
'Monday',
'Tuesday',
'Wednesday',
'Thursday',
'Friday',
'Saturday',
'Sunday'
)
)
ORDER BY
passenger_count;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment