
Örjan Angré (Lundberg) oluies

  • Sweden
  • X @oluies
oluies / gce-vpn-install.sh
Created February 7, 2016 12:51 — forked from greenido/gce-vpn-install.sh
Installing a VPN on GCE
#!/bin/sh
#
# Automatic configuration of a VPN on GCE debian-7-wheezy server.
# Tested only on debian-7-wheezy.
#
# This work is licensed under the Creative Commons Attribution-ShareAlike 3.0
# Unported License: http://creativecommons.org/licenses/by-sa/3.0/
#
# Thx to: https://github.com/sarfata/voodooprivacy/blob/master/voodoo-vpn.sh for the code/idea
#
oluies / sparkmovingavg.scala
Last active February 27, 2016 19:45
Spark moving average
val schema = Seq("id", "cykle", "value")
val data = Seq(
  (1, 1, 1),
  (1, 2, 11),
  (1, 3, 1),
  (1, 4, 11),
  (1, 5, 1),
  (1, 6, 11),
  (2, 1, 1),
  (2, 2, 11),
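
The preview cuts off before the actual moving-average step. A minimal sketch of how the rest likely looks, assuming the data Seq above is completed, turned into a DataFrame via the SQLContext implicits, and averaged over a centred 3-row window per id (the window width is an assumption):

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.avg
import sqlContext.implicits._

// Build the DataFrame from the (id, cykle, value) tuples above.
val df = data.toDF(schema: _*)

// Sliding window per id, ordered by cykle: the current row plus one row on each side.
val movingWindow = Window.partitionBy("id").orderBy("cykle").rowsBetween(-1, 1)

// Moving average of "value" over that window.
df.withColumn("moving_avg", avg($"value").over(movingWindow)).show()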
oluies / rme.scala
Created February 28, 2016 08:26
Rolling mean and rolling stddev over 21 variables
// PARTITION BY id ORDER BY cykle ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING (a 5-row window)
val w = Window.partitionBy("id").orderBy("cykle").rowsBetween(-2, 2)
//def getcols(id:String):Seq[Column] = Seq(mean($"s"+id).over(w).as("a"+id), sqrt( sum(pow($"s"+id - mean($"s"+id).over(w),2)).over(w) / 5).as("sd"+id))
val x = df.select($"*",
  mean($"s1").over(w).as("a1"),
  sqrt(sum(pow($"s1" - mean($"s1").over(w), 2)).over(w) / 5).as("sd1"),
  mean($"s2").over(w).as("a2"),
|-- id: integer (nullable = true)
|-- cykle: integer (nullable = true)
|-- setting1: double (nullable = true)
|-- setting2: double (nullable = true)
|-- setting3: double (nullable = true)
|-- s1: double (nullable = true)
|-- s2: double (nullable = true)
|-- s3: double (nullable = true)
|-- s4: double (nullable = true)
|-- s5: double (nullable = true)
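
The commented-out getcols helper above hints at generating the per-sensor expressions rather than writing them out 21 times. A minimal sketch of that idea, assuming the window w and DataFrame df from the snippet above and sensor columns named s1 through s21; it uses col("s" + id) instead of $"s" + id, since the latter is column arithmetic rather than string concatenation:

import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, mean, pow, sqrt, sum}

// Rolling mean and stddev over the 5-row window w, for one sensor column (e.g. "s1").
def getcols(id: String): Seq[Column] = Seq(
  mean(col("s" + id)).over(w).as("a" + id),
  sqrt(sum(pow(col("s" + id) - mean(col("s" + id)).over(w), 2)).over(w) / 5).as("sd" + id)
)

// Expand to all 21 sensors instead of spelling each expression out by hand.
val rollingCols = (1 to 21).flatMap(i => getcols(i.toString))
val x = df.select(col("*") +: rollingCols: _*)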
#!/bin/sh
. /etc/spark/conf/spark-env.sh
export PYSPARK_SUBMIT_ARGS="--master yarn-client pyspark-shell"
# Spark master url. eg. spark://master_addr:7077. Leave empty if you want to use local mode
export MASTER=yarn-client
export SPARK_YARN_JAR=hdfs:///apps/zeppelin/zeppelin-spark-0.5.5-SNAPSHOT.jar
export JAVA_HOME=/usr/jdk64/jdk1.8.0_60
# Additional jvm options. for example, export ZEPPELIN_JAVA_OPTS="-Dspark.executor.memory=8g -Dspark.cores.max=16"
export JAVA_OPTS="-Dhdp.version=2.4.0.0-169 -Dspark.executor.memory=1024m -Dspark.executor.instances=2 -Dspark.yarn.queue=default"
package com.combient.sparkjob.tedsds
/**
* Created by olu on 09/03/16.
*/
import org.apache.spark.ml.feature.{MinMaxScaler, VectorAssembler}
import org.apache.spark.sql.{Column, SaveMode, SQLContext, GroupedData}
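
Only the package declaration and imports of this file survive in the preview. Given the imports, a minimal sketch of how VectorAssembler and MinMaxScaler are typically chained; the column names and the df variable are illustrative assumptions, not the gist's actual ones:

// Assemble raw sensor columns into a single feature vector, then rescale it to [0, 1].
val assembler = new VectorAssembler()
  .setInputCols(Array("s1", "s2", "s3")) // assumed input columns
  .setOutputCol("features")

val scaler = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")

val assembled = assembler.transform(df)                 // df assumed to hold the sensor data
val scaled = scaler.fit(assembled).transform(assembled) // fit the min/max ranges, then scale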
include "application"
manager {
  notebooks {
    dir = "/usr/share/spark-notebook/notebooks"
    override {
      sparkConf = {
        spark.driver.extraJavaOptions: "-Dhdp.version=2.4.0.0-169",
        spark.yarn.am.extraJavaOptions: "-Dhdp.version=2.4.0.0-169"
      }
#!/usr/bin/env bash
### ------------------------------- ###
### Helper methods for BASH scripts ###
### ------------------------------- ###
die() {
  echo "$@" 1>&2
  exit 1
}
2016-02-21 22:16:19,395 - [INFO] - from play in main
Application started (Prod)
2016-02-21 22:16:19,522 - [INFO] - from play in main
Listening for HTTP on /0:0:0:0:0:0:0:0:9000
2016-02-21 22:19:12,913 - [DEBUG] - from application in New I/O worker #1
Notebooks directory in the config is referring ./notebooks. Does it exist? true
2016-02-21 22:19:13,037 - [INFO] - from application in New I/O worker #1
// Read each input directory as a DataFrame, union them all, repartition to the
// desired number of output files, and write the result back out.
import scala.collection.mutable.MutableList
import org.apache.spark.sql.DataFrame

val dfSeq = MutableList[DataFrame]()
sourceDirsToConsolidate.foreach { dir =>
  val df = sqlContext.parquetFile(dir)
  dfSeq += df
}
val masterDf = dfSeq.reduce((df1, df2) => df1.unionAll(df2))
masterDf.coalesce(numOutputFiles).write.mode(saveMode).parquet(destDir)
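
The snippet references a few free variables that are assumed to be defined earlier. A hypothetical binding of them, with example values only, just to make the sketch self-contained:

import org.apache.spark.sql.SaveMode

// Example values; the real directories and counts would come from the job's arguments.
val sourceDirsToConsolidate = Seq("/data/in/part-2016-02-20", "/data/in/part-2016-02-21")
val numOutputFiles = 4
val saveMode = SaveMode.Overwrite
val destDir = "/data/out/consolidated"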