Last active
January 21, 2022 05:01
-
-
Save thanoojgithub/cc38e45eab0bf8f9e0c400bd1def24f0 to your computer and use it in GitHub Desktop.
PySpark Example One
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import SparkSession | |
from pyspark.sql import Window | |
from pyspark.sql.functions import * | |
spark = SparkSession \ | |
.builder \ | |
.master('local') \ | |
.appName('pyspark-test-run') \ | |
.getOrCreate() | |
df = spark.read.format("csv").options(header='True', inferSchema='True', delimiter=',') \ | |
.load("gs://bucket_27122021/data/travalData/airLineData/Airline_Details.csv") | |
df.write.mode("overwrite").parquet("gs://bucket_27122021/data/travalData/airLineData/parquet/airLineData.parquet") | |
df1 = spark.read.parquet("gs://bucket_27122021/data/travalData/airLineData/parquet/airLineData.parquet") \ | |
.drop('CreatedTimestamp', 'UpdatedTimestamp') | |
print("df1 : ***********") | |
print("before cleansing airLine row count : ") | |
print(df1.count()) | |
df1.show() | |
# AirlineID,AirlineName,Carrier,TailNum | |
df2 = df1.filter(col("AirlineID").isNotNull() & col("AirlineName").isNotNull() & col("Carrier").isNotNull() & | |
col("TailNum").isNotNull()) | |
print("df2 : ***********") | |
df2.show() | |
# df3 = df2.dropDuplicates(["AirlineID", "AirlineName", "Carrier", "TailNum"]) | |
# when, considering all columns in the table for distinct then, dropDuplicates() or specific columns then, above line | |
df31 = df2.dropDuplicates() | |
print("df31 : ***********") | |
df31.show() | |
# DeDuping using row_number if we have deciding column to pick one | |
win1 = Window.partitionBy('AirlineID', 'AirlineName', 'Carrier').orderBy(col('TailNum').desc()) | |
df3 = df2.withColumn('rn', row_number().over(win1)).where('rn = 1').drop('rn') | |
print("df3 : ***********") | |
df3.show() | |
# df3 = df2.withColumn('startDate', date_format(current_timestamp(), 'yyyy-MM-dd')) | |
# current_date() | |
df4 = df31.withColumn("startDate", current_timestamp()) \ | |
.withColumn("endDate", (unix_timestamp() + 999999999).cast('timestamp')) | |
print("df4 : ***********") | |
df4.show(truncate=False) | |
print("after cleansing airLine row count : ") | |
print(df4.count()) | |
df1 : *********** | |
before cleansing airLine row count : | |
9878 | |
+---------+---------------+-------+-------+ | |
|AirlineID| AirlineName|Carrier|TailNum| | |
+---------+---------------+-------+-------+ | |
| 19393|Alaska Airlines| WN| N288WN| | |
| 19393|Alaska Airlines| WN| N238WN| | |
| 19393|Alaska Airlines| WN| N8325D| | |
| 19393|Alaska Airlines| WN| N7719A| | |
| 19393|Alaska Airlines| WN| N775SW| | |
| 19393|Alaska Airlines| WN| N761RR| | |
| 19393|Alaska Airlines| WN| N8652B| | |
| 19393|Alaska Airlines| WN| N357SW| | |
| 19393|Alaska Airlines| WN| N290WN| | |
| 19393|Alaska Airlines| WN| N421LV| | |
| 19393|Alaska Airlines| WN| N665WN| | |
| 19393|Alaska Airlines| WN| N555LV| | |
| 19393|Alaska Airlines| WN| N633SW| | |
| 19393|Alaska Airlines| WN| N7711N| | |
| 19393|Alaska Airlines| WN| N439WN| | |
| 19393|Alaska Airlines| WN| N283WN| | |
| 19393|Alaska Airlines| WN| N246LV| | |
| 19393|Alaska Airlines| WN| N427WN| | |
| 19393|Alaska Airlines| WN| N399WN| | |
| 19393|Alaska Airlines| WN| N652SW| | |
+---------+---------------+-------+-------+ | |
only showing top 20 rows | |
df2 : *********** | |
+---------+---------------+-------+-------+ | |
|AirlineID| AirlineName|Carrier|TailNum| | |
+---------+---------------+-------+-------+ | |
| 19393|Alaska Airlines| WN| N288WN| | |
| 19393|Alaska Airlines| WN| N238WN| | |
| 19393|Alaska Airlines| WN| N8325D| | |
| 19393|Alaska Airlines| WN| N7719A| | |
| 19393|Alaska Airlines| WN| N775SW| | |
| 19393|Alaska Airlines| WN| N761RR| | |
| 19393|Alaska Airlines| WN| N8652B| | |
| 19393|Alaska Airlines| WN| N357SW| | |
| 19393|Alaska Airlines| WN| N290WN| | |
| 19393|Alaska Airlines| WN| N421LV| | |
| 19393|Alaska Airlines| WN| N665WN| | |
| 19393|Alaska Airlines| WN| N555LV| | |
| 19393|Alaska Airlines| WN| N633SW| | |
| 19393|Alaska Airlines| WN| N7711N| | |
| 19393|Alaska Airlines| WN| N439WN| | |
| 19393|Alaska Airlines| WN| N283WN| | |
| 19393|Alaska Airlines| WN| N246LV| | |
| 19393|Alaska Airlines| WN| N427WN| | |
| 19393|Alaska Airlines| WN| N399WN| | |
| 19393|Alaska Airlines| WN| N652SW| | |
+---------+---------------+-------+-------+ | |
only showing top 20 rows | |
df31 : *********** | |
+---------+-----------------+-------+-------+ | |
|AirlineID| AirlineName|Carrier|TailNum| | |
+---------+-----------------+-------+-------+ | |
| 19393| Alaska Airlines| WN| N784SW| | |
| 19790|American Airlines| DL| N971DL| | |
| 19790|American Airlines| DL| N911DA| | |
| 19790|American Airlines| DL| N991DL| | |
| 19790|American Airlines| DL| N306DN| | |
| 19790|American Airlines| DL| N863DN| | |
| 19805| Delta AirLines| AA| N438AA| | |
| 19805| Delta AirLines| AA| N3BGAA| | |
| 19805| Delta AirLines| AA| N7LNAA| | |
| 19805| Delta AirLines| AA| N8AFAA| | |
| 19805| Delta AirLines| AA| N521UW| | |
| 19805| Delta AirLines| AA| N8LLAA| | |
| 19977|Hawaiian Airlines| UA| N420UA| | |
| 19977|Hawaiian Airlines| UA| N15751| | |
| 19977|Hawaiian Airlines| UA| N69816| | |
| 19977|Hawaiian Airlines| UA| N76062| | |
| 19977|Hawaiian Airlines| UA| N220UA| | |
| 19977|Hawaiian Airlines| UA| N128UA| | |
| 20304| JetBlue Airlines| OO| N692CA| | |
| 20304| JetBlue Airlines| OO| N641CA| | |
+---------+-----------------+-------+-------+ | |
only showing top 20 rows | |
df3 : *********** | |
+---------+------------------+-------+-------+ | |
|AirlineID| AirlineName|Carrier|TailNum| | |
+---------+------------------+-------+-------+ | |
| 19393| Alaska Airlines| WN| N969WN| | |
| 19690|Allegiant Airlines| HA| N594HA| | |
| 19790| American Airlines| DL| N999DN| | |
| 19805| Delta AirLines| AA| N998AA| | |
| 19930| Frontier Airlines| AS| N799AS| | |
| 19977| Hawaiian Airlines| UA| N896UA| | |
| 20304| JetBlue Airlines| OO| N988CA| | |
| 20366|Southwest Airlines| EV| N981EV| | |
| 20409| Spirit Airlines| B6| N990JB| | |
| 20416| United Airlines| NK| N905NK| | |
| 20436| ABC Airlines| F9| PLANET| | |
| 21171| XYZ Airlines| VX| N925VA| | |
+---------+------------------+-------+-------+ | |
df4 : *********** | |
+---------+-----------------+-------+-------+-----------------------+-------------------+ | |
|AirlineID|AirlineName |Carrier|TailNum|startDate |endDate | | |
+---------+-----------------+-------+-------+-----------------------+-------------------+ | |
|19393 |Alaska Airlines |WN |N784SW |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19790 |American Airlines|DL |N971DL |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19790 |American Airlines|DL |N911DA |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19790 |American Airlines|DL |N991DL |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19790 |American Airlines|DL |N306DN |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19790 |American Airlines|DL |N863DN |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19805 |Delta AirLines |AA |N438AA |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19805 |Delta AirLines |AA |N3BGAA |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19805 |Delta AirLines |AA |N7LNAA |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19805 |Delta AirLines |AA |N8AFAA |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19805 |Delta AirLines |AA |N521UW |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19805 |Delta AirLines |AA |N8LLAA |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19977 |Hawaiian Airlines|UA |N420UA |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19977 |Hawaiian Airlines|UA |N15751 |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19977 |Hawaiian Airlines|UA |N69816 |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19977 |Hawaiian Airlines|UA |N76062 |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19977 |Hawaiian Airlines|UA |N220UA |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|19977 |Hawaiian Airlines|UA |N128UA |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|20304 |JetBlue Airlines |OO |N692CA |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
|20304 |JetBlue Airlines |OO |N641CA |2022-01-21 04:56:36.842|2053-09-29 06:43:15| | |
+---------+-----------------+-------+-------+-----------------------+-------------------+ | |
only showing top 20 rows | |
after cleansing airLine row count : | |
5383 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment