Skip to content

Instantly share code, notes, and snippets.

View 1ambda's full-sized avatar
🦁
in the jungle

Kun 1ambda

🦁
in the jungle
View GitHub Profile
// df.printSchema()
root
|-- ID: integer (nullable = true)
|-- Year_Birth: integer (nullable = true)
|-- Education: string (nullable = true)
|-- Marital_Status: string (nullable = true)
|-- Income: integer (nullable = true)
|-- Kidhome: integer (nullable = true)
|-- Teenhome: integer (nullable = true)
/**
 * Rewrites a camelCase identifier in lower-case underscore form.
 *
 * Each upper-case ASCII letter or digit in `name` is replaced by an
 * underscore followed by its lower-case form, e.g. `yearBirth` becomes
 * `year_birth`. The rule is applied to every matching character on its own,
 * so a name that starts with a capital gets a leading underscore and runs of
 * capitals or digits are split one by one (`ID` becomes `_i_d`).
 *
 * @param name identifier to convert
 * @return `name` with every `[A-Z\d]` character prefixed by `_` and lower-cased
 */
def camelToUnderscores(name: String) = {
  val upperOrDigit = "[A-Z\\d]".r
  upperOrDigit.replaceAllIn(name, hit => s"_${hit.matched.toLowerCase()}")
}
/**
 * Converts an underscore-separated name to camelCase.
 *
 * The segment before the first underscore is lower-cased in full; every
 * later segment only has its first character upper-cased and is otherwise
 * left as is. `Year_Birth` becomes `yearBirth`, `ID` becomes `id`.
 *
 * @param name underscore-separated identifier
 * @return the camelCase form, or the empty string when `name` contains
 *         nothing but underscores
 */
def underscoresToCamel(name: String): String = {
  val parts = name.split("_")
  // split drops trailing empty segments, so an input such as "_" produces an
  // empty array; headOption avoids the NoSuchElementException `.head` raised.
  parts.headOption.fold("") { first =>
    (first.toLowerCase() +: parts.tail.map(_.capitalize)).mkString
  }
}
// dfLower.select("id", "year_birth", "education", "income", "kidhome", "teenhome", "dt_customer").show()
+----+----------+----------+------+-------+--------+-----------+
| id|year_birth| education|income|kidhome|teenhome|dt_customer|
+----+----------+----------+------+-------+--------+-----------+
|5524| 1957|Graduation| 58138| 0| 0| 04-09-2012|
|2174| 1954|Graduation| 46344| 1| 1| 08-03-2014|
|4141| 1965|Graduation| 71613| 0| 0| 21-08-2013|
|6182| 1984|Graduation| 26646| 1| 0| 10-02-2014|
|5324| 1981| PhD| 58293| 1| 0| 19-01-2014|
|7446| 1967| Master| 62513| 0| 1| 09-09-2013|
// Project the camelCase frame down to the eight columns used below and
// mark the result for caching.
val dfSelected = dfCamel.select(
  "id",
  "yearBirth",
  "education",
  "income",
  "kidhome",
  "teenhome",
  "dtCustomer",
  "recency"
).cache()
/**
 * Typed row used when viewing the selected marketing columns as a Dataset.
 *
 * Field names mirror the selected camelCase columns, except `userId`, which
 * corresponds to the `id` column after it is renamed.
 *
 * Note: the numeric fields are primitive `Int`s although the source schema
 * reports those columns as nullable. Decoding a row whose `income` is null
 * fails with the NullPointerException reproduced at the end of this file;
 * the Spark message there suggests `Option[_]` for such fields.
 *
 * @param userId     customer identifier (the `id` column, renamed)
 * @param yearBirth  value of the `yearBirth` column
 * @param education  value of the `education` column
 * @param income     value of the `income` column
 * @param kidhome    value of the `kidhome` column
 * @param teenhome   value of the `teenhome` column
 * @param dtCustomer value of the `dtCustomer` column, kept as a string
 * @param recency    value of the `recency` column
 */
case class MarketingUser(
  userId: Int,
  yearBirth: Int,
  education: String,
  income: Int,
  kidhome: Int,
  teenhome: Int,
  dtCustomer: String,
  recency: Int
)
// View the selected columns as a Dataset of MarketingUser. The `id` column
// is renamed first so that every column name matches a case-class field.
val dsMarketingUser = {
  val renamed = dfSelected.withColumnRenamed("id", "userId")
  renamed.as[MarketingUser]
}
# dfSelected
org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, yearBirth: int ... 6 more fields]
# dsMarketingUser
org.apache.spark.sql.Dataset[MarketingUser] = [userId: int, yearBirth: int ... 6 more fields]
// Column-based aggregation: group the users by `education` and sum `income`
// within each group. The result is not bound to a name here.
dsMarketingUser.groupBy(col("education")).agg(sum("income"))
// Typed filter: the predicate runs against MarketingUser instances, keeping
// only users whose education is exactly "Master". Showing the result fails
// with the NullPointerException reproduced below when a row has a null
// `income`, because that field is a non-nullable Int.
val dsUserFiltered = dsMarketingUser.filter(_.education == "Master")
# dsUserFiltered.show()
Caused by: NullPointerException: Null value appeared in non-nullable field:
- field (class: "scala.Int", name: "income")
- root class: "$line9228218dd17046b3aa8b6bbe593dc28a520.$read.$iw.$iw.$iw.$iw.$iw.$iw.$iw.$iw.$iw.$iw.$iw.$iw.$iw.$iw.MarketingUser"
If the schema is inferred from a Scala tuple/case class, or a Java bean, please try to use scala.Option[_] or other nullable types (e.g. java.lang.Integer instead of int/scala.Int).