Skip to content

Instantly share code, notes, and snippets.

View 1ambda's full-sized avatar
🦁
in the jungle

Kun 1ambda

🦁
in the jungle
View GitHub Profile
case class MarketingUserRefined(userId: Int,
yearBirth: Int,
education: String,
income: Option[Int],
kidhome: Option[Int],
teenhome: Option[Int],
dtCustomer: String,
recency: Option[Int])
val dsUserRefined = dfSelected
val dsUserUpdated = dsUserFiltered
.map(x => x.copy(yearBirth = x.yearBirth + 3))
TragetUser(userId, income, hasKidhome, hasTeenhome, recency)
# Transformation 입니다. 아직 실행되지 않습니다.
df = spark.read.load("./marketing_campaign.csv",
format="csv", sep="\t", inferSchema="true", header="true")
# Transformation 입니다. 아직 실행되지 않습니다.
dfSelected = df.select(
col("ID").alias("id"),
col("Year_Birth").alias("year_birth"),
col("Education").alias("education"),
col("Kidhome").alias("count_kid"),
# Transformation 입니다. 아직 실행되지 않습니다.
df = spark.read.load("./marketing_campaign.csv",
format="csv", sep="\t", inferSchema="true", header="true")
# Transformation 입니다. 아직 실행되지 않습니다.
dfSelected = df.select(
col("ID").alias("id"),
col("Year_Birth").alias("year_birth"),
col("Education").alias("education"),
col("Kidhome").alias("count_kid"),
# dfSelected.explain("formatted") 의 실행 결과
(1) Scan csv
Output [7]: [ID#16, Year_Birth#17, Education#18, Kidhome#21, Teenhome#22, Dt_Customer#23, Recency#24]
Batched: false
Location: InMemoryFileIndex [file:/home/jovyan/private-notebook/spark-tutorial/marketing_campaign.csv]
ReadSchema: struct<ID:int,Year_Birth:int,Education:string,Kidhome:int,Teenhome:int,Dt_Customer:string,Recency:int>
(2) Project [codegen id : 1]
Output [7]: [ID#16 AS id#190, Year_Birth#17 AS year_birth#191, Education#18 AS education#192, Kidhome#21 AS count_kid#193, Teenhome#22 AS count_teen#194, Dt_Customer#23 AS date_customer#195, Recency#24 AS days_last_login#196]
Input [7]: [ID#16, Year_Birth#17, Education#18, Kidhome#21, Teenhome#22, Dt_Customer#23, Recency#24]
add_months(cast(gettimestamp(Dt_Customer#23, d-M-yyyy, Some(Asia/Seoul), false) as date), 72) AS date_joined#227
# `dfSelected.rdd.id` 출력 결과
<bound method RDD.id of MapPartitionsRDD[15] at javaToPython at NativeMethodAccessorImpl.java:0>
# `dfConverted.rdd.id` 출력 결과
<bound method RDD.id of MapPartitionsRDD[42] at javaToPython at NativeMethodAccessorImpl.java:0>
# dfConvertedxplain("extended")
== Parsed Logical Plan ==
'Project [id#236, year_birth#237, education#238, count_kid#239, count_teen#240, date_customer#241, days_last_login#242, add_months(to_date('date_customer, Some(d-M-yyyy)), 72) AS date_joined#257]
+- Project [ID#16 AS id#236, Year_Birth#17 AS year_birth#237, Education#18 AS education#238, Kidhome#21 AS count_kid#239, Teenhome#22 AS count_teen#240, Dt_Customer#23 AS date_customer#241, Recency#24 AS days_last_login#242]
+- Relation[ID#16,Year_Birth#17,Education#18,Marital_Status#19,Income#20,Kidhome#21,Teenhome#22,Dt_Customer#23,Recency#24,MntWines#25,MntFruits#26,MntMeatProducts#27,MntFishProducts#28,MntSweetProducts#29,MntGoldProds#30,NumDealsPurchases#31,NumWebPurchases#32,NumCatalogPurchases#33,NumStorePurchases#34,NumWebVisitsMonth#35,AcceptedCmp3#36,AcceptedCmp4#37,AcceptedCmp5#38,AcceptedCmp1#39,... 5 more fields] csv
== Analyzed Logical Plan ==
id: int, year_birth: int, education: string, count_kid: int, count_teen: int, date_customer: string
# dfConvertedxplain("extended")
== Parsed Logical Plan ==
'Project [id#236, year_birth#237, education#238, count_kid#239, count_teen#240, date_customer#241, days_last_login#242, add_months(to_date('date_customer, Some(d-M-yyyy)), 72) AS date_joined#257]
+- Project [ID#16 AS id#236, Year_Birth#17 AS year_birth#237, Education#18 AS education#238, Kidhome#21 AS count_kid#239, Teenhome#22 AS count_teen#240, Dt_Customer#23 AS date_customer#241, Recency#24 AS days_last_login#242]
+- Relation[ID#16,Year_Birth#17,Education#18,Marital_Status#19,Income#20,Kidhome#21,Teenhome#22,Dt_Customer#23,Recency#24,MntWines#25,MntFruits#26,MntMeatProducts#27,MntFishProducts#28,MntSweetProducts#29,MntGoldProds#30,NumDealsPurchases#31,NumWebPurchases#32,NumCatalogPurchases#33,NumStorePurchases#34,NumWebVisitsMonth#35,AcceptedCmp3#36,AcceptedCmp4#37,AcceptedCmp5#38,AcceptedCmp1#39,... 5 more fields] csv
== Analyzed Logical Plan ==
id: int, year_birth: int, education: string, count_kid: int, count_teen: int, date_customer: string