Skip to content

Instantly share code, notes, and snippets.

@Hungsiro506
Last active November 23, 2017 04:22
Show Gist options
  • Save Hungsiro506/4bab73a91c591f6a04edc267d68585ff to your computer and use it in GitHub Desktop.
Save Hungsiro506/4bab73a91c591f6a04edc267d68585ff to your computer and use it in GitHub Desktop.
scala> val dns = spark.sqlContext.read.parquet("/data/dns/dns-extracted-two-hours/2017-11-22-02/out/")
dns: org.apache.spark.sql.DataFrame = [value: string]
scala> val splited = dns.withColumn("temp",split(col("value"),"\\t"))
splited: org.apache.spark.sql.DataFrame = [value: string, temp: array<string>]
scala> val df = splited.select((0 until 25).map(i => col("temp").getItem(i).as(s"col$i")): _*)
df: org.apache.spark.sql.DataFrame = [col0: string, col1: string ... 23 more fields]
scala> val npic = df.where("col24 = '-1'").select("col2")
npic: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [col0: string, col1: string ... 23 more fields]
scala> npic.write.format("csv").option("header","false").save("/data/dns/npic_2017_11_22_02")
:paste
enter
-- paste the code below here --
ctrl + d
////////////////////////////////////////////////////////////////////////////////////////////////////
/**
 * Extracts the distinct NPIC identifiers (col2) from the two-hour DNS extract
 * for one partition and writes them out as a headerless CSV.
 *
 * The raw parquet holds a single tab-separated `value` column; it is split into
 * 25 positional columns (col0..col24). Rows with col24 = '-1' are the NPIC rows.
 *
 * @param x    even hour of the two-hour partition (0, 2, ..., 22); zero-padded
 *             to two digits for the path, e.g. 2 -> "02"
 * @param date partition date in yyyy-MM-dd form; defaults to "2017-11-22" so
 *             existing callers keep the original behavior
 */
def export_distinct_npic(x: Int, date: String = "2017-11-22") = {
  val hour = "%02d".format(x)
  println(hour)
  val dns = spark.sqlContext.read.parquet(s"/data/dns/dns-extracted-two-hours/${date}-${hour}/out/")
  // Raw lines are tab-separated; expose them as an array column first.
  val splited = dns.withColumn("temp", split(col("value"), "\\t"))
  // Fan the array out into 25 named positional columns col0..col24.
  val df = splited.select((0 until 25).map(i => col("temp").getItem(i).as(s"col$i")): _*)
  // col24 = '-1' marks the NPIC rows; col2 carries the identifier of interest.
  val npic = df.where("col24 = '-1'").select("col2").distinct
  // Output path uses underscores (e.g. npic_2017_11_22_02), matching the original layout.
  npic.write.format("csv").option("header", "false").mode("overwrite")
    .save(s"/data/dns/npic_${date.replace("-", "_")}_${hour}")
}
/**
 * Runs the distinct-NPIC export for every two-hour partition of the day:
 * hours 0, 2, 4, ..., 22.
 */
def exc() = {
  (0 to 22 by 2).foreach(hour => export_distinct_npic(hour))
}
////////////////////////////////////////////////////////////////////////////////////////////////////
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment