Örjan Angré (Lundberg) oluies

  • Sweden
  • X @oluies
package com.combient.sparkjob.tedsds
/**
* Created by olu on 09/03/16.
*/
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
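
These imports pair HiveContext with Window: in Spark 1.x, window expressions required a HiveContext. A minimal sketch of the usual pattern — the column names (id, cycle, value) are hypothetical, not taken from the gist:

// Hypothetical illustration of the imports above (not part of the original
// gist): a lag window partitioned per id and ordered by cycle.
val sc = new SparkContext(new SparkConf().setAppName("tedsds"))
val sqlContext = new HiveContext(sc)
import sqlContext.implicits._

val readings = Seq((1, 1, 10.0), (1, 2, 12.0), (2, 1, 9.0)).toDF("id", "cycle", "value")
val byIdOverCycles = Window.partitionBy($"id").orderBy($"cycle")
readings.withColumn("prev_value", lag($"value", 1).over(byIdOverCycles)).show()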
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
> $ twurl /1.1/lists/members.json --data 'count=700&owner_screen_name=Europarl_EN&slug=all-meps-on-twitter' | jq '.users[].id' > meps.txt
// Recursive Pascal's triangle: pascal(c, r) is the entry in column c of row r,
// e.g. pascal(2, 4) == 6 since row 4 is 1 4 6 4 1.
fun pascal(c: Int, r: Int): Int {
    require(c >= 0) { "column must not be negative" }
    require(r >= 0) { "row must not be negative" }
    require(c <= r) { "column must not exceed the row" }
    return when {
        c == 0 || c == r -> 1
        else -> pascal(c - 1, r - 1) + pascal(c, r - 1)
    }
}
import com.typesafe.config.ConfigFactory
import twitter4j.{RateLimitStatus, TwitterFactory}
import twitter4j.conf.ConfigurationBuilder
import java.util.{Timer, TimerTask}
import scala.collection.JavaConverters._
import scala.collection.mutable
object AllListContentFollowers {
  def main(args: Array[String]): Unit = {
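    // The gist preview ends at main's opening brace. What follows is a sketch,
    // not the original body: one plausible continuation that builds a twitter4j
    // client from Typesafe config keys (hypothetical names) and pages through a
    // list's members while respecting the rate limit; the list id is assumed to
    // arrive as args(0).
    val conf = ConfigFactory.load()
    val twitter = new TwitterFactory(new ConfigurationBuilder()
      .setOAuthConsumerKey(conf.getString("twitter.consumerKey"))
      .setOAuthConsumerSecret(conf.getString("twitter.consumerSecret"))
      .setOAuthAccessToken(conf.getString("twitter.accessToken"))
      .setOAuthAccessTokenSecret(conf.getString("twitter.accessTokenSecret"))
      .build()).getInstance()

    val members = mutable.ArrayBuffer[twitter4j.User]()
    var cursor = -1L
    while (cursor != 0) {
      val page = twitter.getUserListMembers(args(0).toLong, cursor)
      members ++= page.asScala
      val rl: RateLimitStatus = page.getRateLimitStatus
      if (rl != null && rl.getRemaining == 0)
        Thread.sleep(rl.getSecondsUntilReset.toLong * 1000L)
      cursor = page.getNextCursor
    }
    members.foreach(u => println(s"${u.getId}\t${u.getScreenName}"))
  }
}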
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.plotly as py
from plotly.graph_objs import *
import pandas as pd
import requests
requests.packages.urllib3.disable_warnings()
init_notebook_mode(connected=True)
@oluies
oluies / spark_unix_timestamp.scala
Created August 19, 2016 11:57
Spark: transform timestamp text to timestamp and extract some parts
import org.apache.spark.sql.functions.{dayofmonth, month, unix_timestamp, year}

// the second row is deliberately unparseable and yields null timestamps
val df = Seq((1L, "03JUN2015 19.28.00"), (2L, "#$@#@#")).toDF("id", "dts")

df.withColumn("ts", unix_timestamp($"dts", "ddMMMyyyy HH.mm.ss").cast("timestamp"))
  .withColumn("dom", dayofmonth($"ts"))
  .withColumn("month", month($"ts"))
  .withColumn("year", year($"ts"))
  .show(2, false)
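
Only the first row parses; the second yields nulls. The expected show() output, give or take Spark's timestamp rendering:

+---+------------------+---------------------+----+-----+----+
|id |dts               |ts                   |dom |month|year|
+---+------------------+---------------------+----+-----+----+
|1  |03JUN2015 19.28.00|2015-06-03 19:28:00.0|3   |6    |2015|
|2  |#$@#@#            |null                 |null|null |null|
+---+------------------+---------------------+----+-----+----+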
@oluies
oluies / vectorsum.scala
Created September 12, 2016 13:04
summarize a vector in spark
import org.apache.spark.sql.Row
import breeze.linalg.{DenseVector => BDV}
import org.apache.spark.mllib.linalg.{Vector, Vectors}

val t_df = sqlContext.read.parquet("/user/s89718/Pivoted_cust_weekday_total_with_Clusters.parquet")
val tm_df = t_df.select("IP_ID", "assembled")

// zero value for the fold below: one slot per weekday
val emptyVector = BDV(Array.fill(7)(0.0))
val zeVector = tm_df
  .rdd
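  // (sketch: the gist preview ends at .rdd) turn each row's mllib Vector into
  // a Breeze vector and sum them all, with emptyVector as the zero value
  .map(row => BDV(row.getAs[Vector]("assembled").toArray))
  .fold(emptyVector)(_ + _)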
// Luhn check of a Swedish personnummer on the form "YYMMDD-NNNC": weight the
// first nine digits alternately by 2 and 1, digit-sum each product, and
// verify that the check digit C makes the total a multiple of 10.
def checkSEPnr(pnr: String): Boolean = {
  val digits = pnr.filter(_ != '-').map(_ - '0')
  val luhnSum: Int = digits.take(9).foldLeft((0, 2)) { (r, c) =>
    // (c * weight) / 10 + (c * weight) % 10 is the digit sum, since c * 2 <= 18
    (r._1 + (c * r._2) / 10 + (c * r._2) % 10, if (r._2 == 2) 1 else 2)
  }._1 % 10
  (10 - luhnSum) % 10 == digits(9)
}
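
A quick check with a sample number: for "811218-9876" the weighted digit sums are 7+1+2+2+2+8+9+8+5 = 44, so the check digit must be (10 - 4) % 10 = 6, which matches the tenth digit:

checkSEPnr("811218-9876")  // true
checkSEPnr("811218-9870")  // false (wrong check digit)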