This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split a '#'-delimited list of ';'-separated records into one output line per
# field, printed as "<record-number> <field-index> <value>" (tr '\012' = newline,
# so each '#'-group becomes its own awk record).
# NOTE(review): the command ends with a trailing '|' — the rest of the pipeline
# appears cut off in this capture; confirm against the original gist.
echo "15;23;35#18;14;89" | tr '#' '\012' | awk -F';' '{for (i=1;i<=NF;i++){print NR, i, $(i)} }' |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Converts a dotted-quad IPv4 string (e.g. "192.168.0.1") into its 32-bit
// numeric value as a Long: each octet is weighted by 256^(3 - position),
// most-significant octet first.
// NOTE(review): the snippet ends right after the while-loop — the trailing
// `num` result expression and the function's closing brace appear truncated
// in this capture; confirm against the original.
// NOTE(review): `.toInt` throws NumberFormatException on non-numeric octets,
// and `% 256` silently wraps out-of-range octets (e.g. "300") instead of
// rejecting the address as invalid.
import java.net.InetAddress | |
def IPv4ToLong(dottedIP: String): Long = { | |
// Split on a literal '.' (regex metacharacter, hence the escape).
val addrArray: Array[String] = dottedIP.split("\\.") | |
var num: Long = 0 | |
var i: Int = 0 | |
while (i < addrArray.length) { | |
// Weight 256^3 for the first octet down to 256^0 for the last.
val power: Int = 3 - i | |
num = num + ((addrArray(i).toInt % 256) * Math.pow(256, power)).toLong | |
i += 1 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Spark-shell preamble plus the start of a helper that builds an explicit
// StructType schema (presumably for CSV loading — see the sibling csvToDF
// snippet, which accepts an Option[StructType]).
// NOTE(review): the snippet is truncated immediately after `Seq(` — the
// actual StructField list and closing braces are missing from this capture.
import org.apache.spark.sql.types.{DoubleType,LongType,ShortType, IntegerType, StructField,TimestampType, StructType,StringType,NumericType,BooleanType} | |
import org.apache.hadoop.fs.{FileSystem,Path} | |
// `sc` is the spark-shell-provided SparkContext.
val sqlContext = new org.apache.spark.sql.SQLContext(sc) | |
import sqlContext.implicits._ | |
def getschemametod(): StructType = { | |
StructType( | |
Seq( |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Loads a delimited text file into a DataFrame. When `schema` is Some(...)
// the caller-supplied StructType is applied; otherwise (in the truncated
// remainder) the reader presumably infers the schema — TODO confirm.
// NOTE(review): the snippet is truncated mid-`match` (right after
// `sqlContext.read`); the reader options (delimiter, charset, header) and
// the None branch are missing from this capture.
import org.apache.spark.sql.types.{DoubleType,LongType,ShortType, IntegerType, StructField,TimestampType, StructType,StringType,NumericType,BooleanType} | |
import org.apache.hadoop.fs.{FileSystem,Path} | |
// `sc` is the spark-shell-provided SparkContext.
val sqlContext = new org.apache.spark.sql.SQLContext(sc) | |
import sqlContext.implicits._ | |
def csvToDF(file: Path, delimiter : String,charset: String = "UTF8", useHeader: Boolean = true, schema: Option[StructType] = None) = { | |
val df = schema match { | |
case Some(schema) => sqlContext.read |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sessionization query: assigns a session_id per user (`id`) by starting a new
# session whenever the gap since the previous event (`starttid`) reaches
# 30 minutes, then numbering sessions as id*10000 + running SUM(new_session)
# ordered by event time.
# NOTE(review): the snippet is truncated mid-CASE; the `new_session` column
# that the outer SELECT sums must be defined in the cut-off remainder of the
# inner query — confirm against the original.
# (No comments can be added inside the SQL string itself without changing it.)
sqlloggik4_df = """ | |
SELECT * | |
, CAST(id as BIGINT) *10000 + SUM(new_session) | |
OVER (PARTITION BY id ORDER BY starttid) | |
AS session_id | |
FROM( | |
SELECT *, | |
unix_timestamp(l.starttid) - LAG(unix_timestamp(l.starttid)) OVER (PARTITION BY l.id ORDER BY l.starttid) timesincelast, | |
CASE | |
WHEN unix_timestamp(l.starttid) - LAG(unix_timestamp(l.starttid)) OVER (PARTITION BY l.id ORDER BY l.starttid) >= 30 * 60 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Luhn checksum for a Swedish personal identity number (personnummer):
// strips '-' separators, maps each character to its digit value, then over
// the first 9 digits alternately multiplies by 2 and 1 and adds the digit
// sum of each product — `(c*w)/10 + (c*w)%10` is the digit sum, valid
// because c*w <= 18 — finally reducing the total mod 10.
// NOTE(review): the snippet ends after computing `lunsum`; the comparison
// against the 10th (check) digit appears truncated in this capture.
def checkSEPnr(pnr:String) = { | |
val chars = pnr.toList | |
// Lazy views avoid building intermediate lists for the filter/map steps.
val removeMinus = chars.view.filter(_ != '-') | |
val charToInt = removeMinus.view.map(_ - '0')  | |
// foldLeft state: (running sum, next weight); weight alternates 2,1,2,1,...
val lunsum: Int = charToInt.take(9).foldLeft( (0,2) ){ | |
(r,c) => | |
(r._1 + (c * r._2) / 10 + (c * r._2) % 10, if (r._2 == 2) 1 else 2) | |
}._1 % 10 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Loads a pivoted customer/weekday dataset from Parquet and prepares a
// 7-element zero vector (one slot per weekday) as a fallback value.
// BUG(review): `BDV` is never defined or imported here — the import above
// brings in `breeze.linalg.DenseVector`, so this should read
// `DenseVector(Array.fill(7)(0.0))` (the corrected copy of this snippet
// below does exactly that). As written this line will not compile.
// NOTE(review): the snippet is truncated right after `.rdd`.
import org.apache.spark.sql.Row | |
import breeze.linalg.DenseVector | |
import org.apache.spark.mllib.linalg.{Vector, Vectors} | |
val t_df = sqlContext.read.parquet("/user/s89718/Pivoted_cust_weekday_total_with_Clusters.parquet") | |
val tm_df = t_df.select("IP_ID","assembled") | |
val emptyVector = BDV(Array.fill(7)(0.0)) | |
val zeVector = tm_df | |
.rdd |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Corrected variant of the snippet above: loads the pivoted customer/weekday
// Parquet dataset and prepares a 7-element breeze DenseVector of zeros (one
// slot per weekday) as a fallback value — here `DenseVector` is used
// directly, matching the import, instead of the undefined `BDV` alias.
// NOTE(review): the snippet is truncated right after `.rdd`; the map/reduce
// over the RDD rows is missing from this capture.
import org.apache.spark.sql.Row | |
import breeze.linalg.DenseVector | |
import org.apache.spark.mllib.linalg.{Vector, Vectors} | |
val t_df = sqlContext.read.parquet("/user/_/Pivoted_cust_weekday_total_with_Clusters.parquet") | |
val tm_df = t_df.select("IP_ID","assembled") | |
val emptyVector = DenseVector (Array.fill(7)(0.0)) | |
val zeVector = tm_df | |
.rdd |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Demo: parse a "ddMMMyyyy HH.mm.ss" timestamp string into a proper Spark
// timestamp and derive day-of-month / month / year columns. The malformed
// second row ("#$@#@#") parses to null, so every derived column is null for
// it rather than failing the job — useful for showing lenient parsing.
val df = Seq((1L, "03JUN2015 19.28.00"), (2L, "#$@#@#")).toDF("id", "dts")

// FIX: `month` and `year` were called below but never imported in the
// original snippet; import everything this pipeline actually uses.
import org.apache.spark.sql.functions.{dayofmonth, month, unix_timestamp, year}

// FIX: the pattern was "ddMMMyy" while the data carries a 4-digit year
// ("2015"); the explicit "yyyy" form avoids relying on SimpleDateFormat's
// lenient handling of non-two-digit years.
df.withColumn("ts", unix_timestamp($"dts", "ddMMMyyyy HH.mm.ss").cast("timestamp"))
  .withColumn("dom", dayofmonth($"ts"))
  .withColumn("month", month($"ts"))
  .withColumn("year", year($"ts")) // FIX: column name was misspelled "yesar"
  .show(2, false)