Skip to content

Instantly share code, notes, and snippets.

// Sample (category, product-records) pair used to exercise topNProducts below.
// Each record is a CSV string: id,categoryId,name,description,price,imageUrl
// — the price is field index 4 (description is empty in these samples, hence ",,").
// NOTE(review): the label says "Bike & Skate Shop" but every record is a golf
// driver — looks like copied sample data; confirm the intended category label.
val l = ("Bike & Skate Shop", Iterable("933,42,Nike VR_S Covert Driver,,179.99,http://images.acmesports.sports/Nike+VR_S+Covert+Driver",
"934,42,Callaway X Hot Driver,,0.0,http://images.acmesports.sports/Callaway+X+Hot+Driver",
"935,42,TaylorMade RocketBallz Stage 2 Driver,,169.99,http://images.acmesports.sports/TaylorMade+RocketBallz+Stage+2+Driver",
"936,42,Cleveland Golf Classic XL Driver,,119.99,http://images.acmesports.sports/Cleveland+Golf+Classic+XL+Driver",
"937,42,Cobra AMP Cell Driver - Orange,,169.99,http://images.acmesports.sports/Cobra+AMP+Cell+Driver+-+Orange"))
/** Tags each of the topN highest-priced records with its category label.
  *
  * @param rec  pair of (category label, CSV product records); the price is
  *             CSV field index 4 and is assumed parseable as a Float.
  * @param topN number of records to keep.
  * @return the topN records ordered priciest-first, each paired with the category.
  */
def topNProducts(rec: (String, Iterable[String]), topN: Int): Iterable[(String, String)] = {
  val (category, records) = rec
  // Explicit reversed ordering reads more clearly than negating the sort key;
  // both are stable, so ties keep their input order either way.
  val byPriceDesc = records.toList.sortBy(r => r.split(",")(4).toFloat)(Ordering[Float].reverse)
  byPriceDesc.take(topN).map(record => (category, record))
}
// Duplicate of the sample data above — redefining `l` like this is only valid
// in a REPL/worksheet session; in a compiled source file it would not compile.
// Record layout: id,categoryId,name,description,price,imageUrl (price = field 4).
val l = ("Bike & Skate Shop", Iterable("933,42,Nike VR_S Covert Driver,,179.99,http://images.acmesports.sports/Nike+VR_S+Covert+Driver",
"934,42,Callaway X Hot Driver,,0.0,http://images.acmesports.sports/Callaway+X+Hot+Driver",
"935,42,TaylorMade RocketBallz Stage 2 Driver,,169.99,http://images.acmesports.sports/TaylorMade+RocketBallz+Stage+2+Driver",
"936,42,Cleveland Golf Classic XL Driver,,119.99,http://images.acmesports.sports/Cleveland+Golf+Classic+XL+Driver",
"937,42,Cobra AMP Cell Driver - Orange,,169.99,http://images.acmesports.sports/Cobra+AMP+Cell+Driver+-+Orange"))
def topNPricedProducts(rec: (String, Iterable[String]), topN: Int): Iterable[(String, String)] = {
val list = rec._2.toList
val topNPrices = list.
map(rec => rec.split(",")(4).toFloat).
// Spark Shell snippet (a script, not a standalone program): persist the raw
// products dataset as a Hadoop sequence file. Sequence files need key/value
// pairs, so each line is keyed with NullWritable — only the record text matters.
import org.apache.hadoop.io._

val products = sc.textFile("/public/retail_db/products")
val nullKeyed = products.map(rec => (NullWritable.get(), rec))
nullKeyed.saveAsSequenceFile("/user/dgadiraju/products_seq")
//Reading sequence files
def topNProducts(rec, topN):
    """Return a generator yielding the topN highest-priced product records.

    rec  -- iterable of CSV product strings; field index 4 is the price and
            is assumed non-empty (filter empty prices before calling).
    topN -- number of records to yield; values <= 0 yield nothing.

    Records are ranked by price descending; ties keep input order (stable sort).
    """
    # Original code assigned x = [] only to overwrite it immediately, and
    # wrapped islice in list() and a generator again — a plain slice suffices.
    ranked = sorted(rec, key=lambda k: float(k.split(",")[4]), reverse=True)
    # Preserve the original contract of returning a generator, not a list.
    return (record for record in ranked[:topN])
# Load the raw products dataset (one CSV line per product) from HDFS.
products = sc.textFile("/public/retail_db/products")
# Keep only records with a non-empty price field (index 4) so that
# float(...) conversions downstream do not raise ValueError.
productsFiltered = products.filter(lambda rec: rec.split(",")[4] != "")
for i in productsFiltered.\
def getTopDenseN(rec, topN):
topNPricedProducts = [ ]
topNPrices = [ ]
prodPrices = [ ]
prodPricesDesc = [ ]
#10 records in rec
for i in rec:
prodPrices.append(float(i.split(",")[4]))
#prodPrices will have only prices from the 10 records
prodPricesDesc = list(sorted(set(prodPrices), reverse=True))
# flume-logger-hdfs.conf: Read data from logs and write it to both logger and hdfs
# flume command to start the agent - flume-ng agent --name a1 --conf /home/dgadiraju/flume_example/example.conf --conf-file example.conf
# NOTE(review): --conf usually takes the Flume configuration *directory*, not a
# file path — verify the command above before using it.
# Name the components on this agent
# (one exec source fanning out to two sink/channel pairs: logger and HDFS)
a1.sources = logsource
a1.sinks = loggersink hdfssink
a1.channels = loggerchannel hdfschannel
# Describe/configure the source
# exec source: tails the output of a shell command (command configured below/elsewhere)
a1.sources.logsource.type = exec
# Create a topic named "kafkadg" with a single partition and a single replica
# (replication-factor 1 gives no fault tolerance — fine for a dev cluster only).
kafka-topics.sh --create \
--zookeeper m01.itversity.com:2181,m02.itversity.com:2181,w01.itversity.com:2181 \
--replication-factor 1 \
--partitions 1 \
--topic kafkadg
# List all topics registered in the ZooKeeper ensemble (verifies the create above).
kafka-topics.sh --list \
--zookeeper m01.itversity.com:2181,m02.itversity.com:2181,w01.itversity.com:2181
# kandf.conf: Flume and Kafka integration
# Read streaming data from logs and push it to Kafka as sink
# Name the components on this agent
# (single pipeline: one exec source -> one memory channel -> one Kafka sink)
kandf.sources = logsource
kandf.sinks = ksink
kandf.channels = mchannel
# Describe/configure the source
# exec source: tails the output of a shell command (command configured below/elsewhere)
kandf.sources.logsource.type = exec
/**
* Created by itversity on 17/03/17.
* This is primarily to get the word count on the data received from
* nc -lk 19999
* Make sure build.sbt is updated with the dependency -
* libraryDependencies += "org.apache.spark" % "spark-streaming_2.10" % "1.6.2"
* Create jar, ship the jar, start nc, and then use spark-submit
* spark-submit --class SparkStreamingWordCount --master yarn --conf spark.ui.port=14562 retail_2.10-1.0.jar
*/
import org.apache.spark.SparkConf
/**
* Created by itversity on 17/03/17.
*/
/* build.sbt
name := "retail"
version := "1.0"
scalaVersion := "2.10.6"