# PySpark and time zones
In [1]:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd


spark = SparkSession.builder \
    .master("local[1]") \
    .appName("SparkByExamples.com") \
    .getOrCreate()

# The session time zone defaults to the time zone of the machine
# running the driver.
spark.conf.get('spark.sql.session.timeZone')
```

Out[1]:

```
'Europe/Berlin'
```
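So the session time zone is inherited from the machine. If reproducibility matters, it can instead be pinned when the session is built, since `spark.sql.session.timeZone` is an ordinary runtime config. A minimal sketch (the app name is arbitrary):

```python
from pyspark.sql import SparkSession

# Pin the session time zone up front instead of inheriting the
# machine default; any IANA zone name is accepted.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("tz-demo")  # hypothetical app name
    .config("spark.sql.session.timeZone", "UTC")
    .getOrCreate()
)

assert spark.conf.get("spark.sql.session.timeZone") == "UTC"
```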
In [13]:

```python
# Sample data: the same wall-clock times with no zone suffix, with a
# "UTC" suffix, and with a "CET" suffix.
pdf = pd.DataFrame(
    [
        ["foo", "2022-10-29 10:56:23"],
        ["bar", "2022-12-29 10:56:23"],
        ["bar", "2022-12-30 10:56:23"],
        ["foo", "2022-10-29 10:56:23 UTC"],
        ["bar", "2022-12-29 10:56:23 UTC"],
        ["bar", "2022-12-30 10:56:23 UTC"],
        ["foo", "2022-10-29 10:56:23 CET"],
        ["bar", "2022-12-29 10:56:23 CET"],
        ["bar", "2022-12-30 10:56:23 CET"],
    ], columns=["name", "ts"]
)
pdf.to_csv("test_data.csv", index=False)

# Read the CSV back with Spark; every column arrives as a string.
sdf = spark.read.csv("./test_data.csv", header=True)
sdf.show(truncate=30)
```

Output:

```
+----+-----------------------+
|name|                     ts|
+----+-----------------------+
| foo|    2022-10-29 10:56:23|
| bar|    2022-12-29 10:56:23|
| bar|    2022-12-30 10:56:23|
| foo|2022-10-29 10:56:23 UTC|
| bar|2022-12-29 10:56:23 UTC|
| bar|2022-12-30 10:56:23 UTC|
| foo|2022-10-29 10:56:23 CET|
| bar|2022-12-29 10:56:23 CET|
| bar|2022-12-30 10:56:23 CET|
+----+-----------------------+
```
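Worth making explicit: `header=True` only fixes the column names, and `ts` is still a plain string column at this point. An explicit cast turns it into a real timestamp, with suffix-free values interpreted in the session time zone. A minimal sketch:

```python
from pyspark.sql import functions as F

# Cast the string column to TimestampType. Values without a zone
# suffix are interpreted in the session time zone; strings Spark
# cannot parse become NULL rather than raising.
parsed = sdf.withColumn("ts_parsed", F.to_timestamp(F.col("ts")))
parsed.printSchema()
```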
In [14]:

```python
# from_utc_timestamp casts the string to a timestamp, reads it as a
# UTC wall clock, and shifts it into the target zone.
new_sdf = (
    sdf
    .withColumn("new_ts_1", F.from_utc_timestamp(F.col("ts"), tz="UTC"))
    .withColumn("new_ts_1_tz", F.lit("UTC"))
    .withColumn("new_ts_2", F.from_utc_timestamp(F.col("ts"), tz="CET"))
    .withColumn("new_ts_2_tz", F.lit("CET"))
)
new_sdf.show(truncate=40)
```

Output:

```
+----+-----------------------+-------------------+-----------+-------------------+-----------+
|name|                     ts|           new_ts_1|new_ts_1_tz|           new_ts_2|new_ts_2_tz|
+----+-----------------------+-------------------+-----------+-------------------+-----------+
| foo|    2022-10-29 10:56:23|2022-10-29 10:56:23|        UTC|2022-10-29 12:56:23|        CET|
| bar|    2022-12-29 10:56:23|2022-12-29 10:56:23|        UTC|2022-12-29 11:56:23|        CET|
| bar|    2022-12-30 10:56:23|2022-12-30 10:56:23|        UTC|2022-12-30 11:56:23|        CET|
| foo|2022-10-29 10:56:23 UTC|2022-10-29 10:56:23|        UTC|2022-10-29 12:56:23|        CET|
| bar|2022-12-29 10:56:23 UTC|2022-12-29 10:56:23|        UTC|2022-12-29 11:56:23|        CET|
| bar|2022-12-30 10:56:23 UTC|2022-12-30 10:56:23|        UTC|2022-12-30 11:56:23|        CET|
| foo|2022-10-29 10:56:23 CET|2022-10-29 08:56:23|        UTC|2022-10-29 10:56:23|        CET|
| bar|2022-12-29 10:56:23 CET|2022-12-29 09:56:23|        UTC|2022-12-29 10:56:23|        CET|
| bar|2022-12-30 10:56:23 CET|2022-12-30 09:56:23|        UTC|2022-12-30 10:56:23|        CET|
+----+-----------------------+-------------------+-----------+-------------------+-----------+
```
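`from_utc_timestamp` has a mirror image: `to_utc_timestamp` reads the wall clock as local to the given zone and shifts it to UTC. A sketch of the round trip on the same `sdf` (DST edge cases aside):

```python
from pyspark.sql import functions as F

# to_utc_timestamp: "this wall clock is CET local time; give me UTC".
# from_utc_timestamp then undoes the shift, so `back` should match
# the wall clock Spark parsed out of `ts`.
roundtrip = (
    sdf
    .withColumn("as_utc", F.to_utc_timestamp(F.col("ts"), tz="CET"))
    .withColumn("back", F.from_utc_timestamp(F.col("as_utc"), tz="CET"))
)
roundtrip.show(truncate=False)
```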
In [15]:

```python
# Pin the session time zone to UTC and repeat; the rendered table is
# identical to the one above.
spark.conf.set('spark.sql.session.timeZone', 'UTC')

new_sdf_2 = (
    sdf
    .withColumn("new_ts_1", F.from_utc_timestamp(F.col("ts"), tz="UTC"))
    .withColumn("new_ts_1_tz", F.lit("UTC"))
    .withColumn("new_ts_2", F.from_utc_timestamp(F.col("ts"), tz="CET"))
    .withColumn("new_ts_2_tz", F.lit("CET"))
)
new_sdf_2.show(truncate=40)
```

Output:

```
+----+-----------------------+-------------------+-----------+-------------------+-----------+
|name|                     ts|           new_ts_1|new_ts_1_tz|           new_ts_2|new_ts_2_tz|
+----+-----------------------+-------------------+-----------+-------------------+-----------+
| foo|    2022-10-29 10:56:23|2022-10-29 10:56:23|        UTC|2022-10-29 12:56:23|        CET|
| bar|    2022-12-29 10:56:23|2022-12-29 10:56:23|        UTC|2022-12-29 11:56:23|        CET|
| bar|    2022-12-30 10:56:23|2022-12-30 10:56:23|        UTC|2022-12-30 11:56:23|        CET|
| foo|2022-10-29 10:56:23 UTC|2022-10-29 10:56:23|        UTC|2022-10-29 12:56:23|        CET|
| bar|2022-12-29 10:56:23 UTC|2022-12-29 10:56:23|        UTC|2022-12-29 11:56:23|        CET|
| bar|2022-12-30 10:56:23 UTC|2022-12-30 10:56:23|        UTC|2022-12-30 11:56:23|        CET|
| foo|2022-10-29 10:56:23 CET|2022-10-29 08:56:23|        UTC|2022-10-29 10:56:23|        CET|
| bar|2022-12-29 10:56:23 CET|2022-12-29 09:56:23|        UTC|2022-12-29 10:56:23|        CET|
| bar|2022-12-30 10:56:23 CET|2022-12-30 09:56:23|        UTC|2022-12-30 10:56:23|        CET|
+----+-----------------------+-------------------+-----------+-------------------+-----------+
```
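The two tables are identical, which deserves a second look: displayed timestamps are rendered in the session time zone, so one would expect at least the zone-suffixed rows to print differently after the change. A plausible explanation is that the session zone was already UTC from an earlier run of this cell (the execution counts suggest reruns). Either way, the session zone's effect on rendering is easy to demonstrate in isolation; a sketch reusing `spark` from above:

```python
from pyspark.sql import functions as F

# One fixed instant, rendered under two session time zones. The
# stored value (microseconds since the epoch, in UTC) never changes;
# only the wall clock that show() prints does.
df = spark.createDataFrame([("2022-10-29 10:56:23 UTC",)], ["ts"])
df = df.withColumn("ts", F.to_timestamp("ts"))

spark.conf.set("spark.sql.session.timeZone", "Europe/Berlin")
df.show()  # expected: 2022-10-29 12:56:23 (Berlin was UTC+2 that day)

spark.conf.set("spark.sql.session.timeZone", "UTC")  # restore for below
df.show()  # expected: 2022-10-29 10:56:23
```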
In [16]:

```python
# Collect new_ts_2 to the driver and inspect the Python objects: each
# value is a plain datetime.datetime with tzinfo=None.
[
    (x["new_ts_2"], x["new_ts_2"].tzinfo, type(x["new_ts_2"]))
    for x in new_sdf_2.select(F.col("new_ts_2")).collect()
]
```

Out[16]:

```
[(datetime.datetime(2022, 10, 29, 14, 56, 23), None, datetime.datetime),
 (datetime.datetime(2022, 12, 29, 12, 56, 23), None, datetime.datetime),
 (datetime.datetime(2022, 12, 30, 12, 56, 23), None, datetime.datetime),
 (datetime.datetime(2022, 10, 29, 14, 56, 23), None, datetime.datetime),
 (datetime.datetime(2022, 12, 29, 12, 56, 23), None, datetime.datetime),
 (datetime.datetime(2022, 12, 30, 12, 56, 23), None, datetime.datetime),
 (datetime.datetime(2022, 10, 29, 12, 56, 23), None, datetime.datetime),
 (datetime.datetime(2022, 12, 29, 11, 56, 23), None, datetime.datetime),
 (datetime.datetime(2022, 12, 30, 11, 56, 23), None, datetime.datetime)]
```
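So the collected values are naive `datetime` objects: `tzinfo` is `None` in every row. They also read 14:56 where the table above showed 12:56, consistent with PySpark materializing timestamps in the driver's local time zone (apparently Europe/Berlin here, UTC+2 on 2022-10-29) rather than the session zone. If aware objects are needed downstream, the zone has to be attached by hand; a sketch, assuming the driver really is on Europe/Berlin:

```python
from zoneinfo import ZoneInfo

# Label the naive datetimes with the driver's zone, then normalize to
# UTC. "Europe/Berlin" is an assumption about the machine that ran
# this notebook; collect() itself never returns aware datetimes.
driver_tz = ZoneInfo("Europe/Berlin")
aware_utc = [
    row["new_ts_2"].replace(tzinfo=driver_tz).astimezone(ZoneInfo("UTC"))
    for row in new_sdf_2.select("new_ts_2").collect()
]
aware_utc[0]  # e.g. datetime.datetime(2022, 10, 29, 12, 56, 23, tzinfo=...UTC)
```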
*Notebook kernel: Python 3.10.7.*