Sanjay Joshi (joshisa)

🎯
Focusing
View GitHub Profile
Python Notebooks
import os
import sys
import subprocess

# Count cores on the master node (IPython "!" shell magic)
masterCores = !cat /proc/cpuinfo | grep cores | tail -1 | tail -c 2
masterCores = int(masterCores[0])

print("Spark Context Version: " + sc.version)
print("Spark Scala Version: " + os.environ['SPARK_SCALA_VERSION'])
print("Python Release version: " + sys.version)
@joshisa
joshisa / gist:5fdb4a021092110e57baecc111d09e2e
Created September 26, 2016 18:00
Example for adding spark-csv and stocator libraries to a pyspark (Python) Jupyter notebook
import os

# Add spark-csv and stocator to the classpath before the SparkContext is created
SUBMIT_ARGS = "--packages com.databricks:spark-csv_2.10:1.5.0,com.ibm.stocator:stocator:1.0.5 pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS

import pyspark
sc = pyspark.SparkContext('local[*]')
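With the packages on the classpath, spark-csv is typically driven through a SQLContext. A minimal hedged read example (the path "example.csv" is a placeholder, not part of the gist):

from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

# Read a CSV using the spark-csv data source pulled in via --packages above
df = (sqlContext.read
      .format("com.databricks.spark.csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load("example.csv"))
df.printSchema()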
@joshisa
joshisa / jupyter_mem_fun.py
Last active October 28, 2016 19:27
Helps enumerate and summarize running notebooks on your Jupyter server
# Modified from http://stackoverflow.com/questions/34685825/jupyter-notebook-memory-usage-for-each-notebook
#
# Usage:
# Default mode (extrovert = False): snoop only your own notebook instance
#   df_mem_breakdown, df_mem_summary, ports = get_proc_info()
#   df_mem_breakdown, df_mem_summary, ports = get_proc_info(False)
#
# Extrovert mode (extrovert = True): snoop all of your sibling notebooks
#   df_mem_breakdown, df_mem_summary, ports = get_proc_info(True)
#
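The gist body itself is omitted in this capture; as a rough sketch of the approach it describes (enumerate kernel processes and summarize their memory with psutil and pandas), not the gist's actual code:

import psutil
import pandas as pd

def get_proc_info(extrovert=False):
    # Approximate sketch, not the original gist: collect Jupyter/IPython kernel
    # processes and report their resident memory.
    me = psutil.Process().username()
    rows = []
    for proc in psutil.process_iter(['pid', 'username', 'cmdline', 'memory_info']):
        cmd = ' '.join(proc.info['cmdline'] or [])
        if 'ipykernel' not in cmd and 'jupyter' not in cmd:
            continue
        if not extrovert and proc.info['username'] != me:
            continue
        if proc.info['memory_info'] is None:
            continue
        rows.append({'pid': proc.info['pid'],
                     'user': proc.info['username'],
                     'rss_mb': proc.info['memory_info'].rss / 1024.0 / 1024.0,
                     'cmd': cmd})
    df_mem_breakdown = pd.DataFrame(rows)
    df_mem_summary = (df_mem_breakdown.groupby('user', as_index=False)['rss_mb'].sum()
                      if not df_mem_breakdown.empty else df_mem_breakdown)
    ports = []  # ports would come from each kernel's connection file; stubbed here
    return df_mem_breakdown, df_mem_summary, ports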
/*
 * This works really well with the Docker Hub image at
 * https://hub.docker.com/r/kamon/grafana_graphite/
 * To build on Bluemix IBM Containers, simply execute this command:
 * cf ic cpi kamon/grafana_graphite registry.ng.bluemix.net/goodenough/grafana_graphite:new
 *
 * Be sure to expose the following public ports when creating the container:
 * 80/tcp, 81/tcp, 2003/tcp, 2004/tcp, 7002/tcp, 8125/udp, 8126/tcp
 * https://github.com/kamon-io/docker-grafana-graphite/blob/master/graphite/carbon.conf#L31-L37
 */
@joshisa
joshisa / shell.scala
Created November 2, 2016 19:00
One method for running shell commands in a Jupyter notebook
import sys.process._

// "!" runs the command, streams its output to the console, and returns the exit code
"ls -al"!
def enableMonitoring(sc):
    from pyspark import SparkContext, SparkConf
    # Copy the existing context's configuration into a fresh SparkConf
    conf = SparkConf()
    for pair in sc._conf.toDebugString().split('\n'):
        confList = pair.split('=', 1)
        conf.set(confList[0], confList[1])
    # Point Spark's metrics system at a Graphite sink
    conf.setAppName("sparkmon")
    conf.set("spark.metrics.conf.*.sink.graphite.class", "org.apache.spark.metrics.sink.GraphiteSink")
    conf.set("spark.metrics.conf.*.sink.graphite.host", "Graphite_HOSTNAME")
    conf.set("spark.metrics.conf.*.sink.graphite.port", "2003")
//==================================================================
// SPARK INSTRUMENTATION
//==================================================================
import com.codahale.metrics.{MetricRegistry, Meter, Gauge}
import org.apache.spark.{SparkEnv, Accumulator}
import org.apache.spark.metrics.source.Source
import org.joda.time.DateTime
import scala.collection.mutable
/**
 * Downloads deps and instantiates classes for future parquet writing in DSX
 */
%AddDeps org.slf4j slf4j-simple 1.7.21 --transitive
%AddDeps org.slf4j slf4j-api 1.7.21 --transitive
Class.forName("org.slf4j.impl.StaticLoggerBinder")

import org.apache.spark.{SparkConf, SparkContext}

val sparkConf = new SparkConf().setMaster("local").setAppName("text")
val sc = new SparkContext(sparkConf)
val hadoopConf = sc.hadoopConfiguration

// Set the AWS credentials so the native S3 filesystem can authenticate
hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoopConf.set("fs.s3n.awsAccessKeyId", "youraccesskeyid")
hadoopConf.set("fs.s3n.awsSecretAccessKey", "secretkey")