package retail

/**
 * Created by itversity on 20/03/17.
 */

/* build.sbt
name := "retail"
*/
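For reference, a plausible complete build.sbt for these snippets, given the Spark core, Spark SQL/Hive, and Typesafe Config imports used below; the version numbers are assumptions, not taken from the gist:

name := "retail"

version := "1.0"

scalaVersion := "2.10.6"

libraryDependencies += "org.apache.spark" % "spark-core_2.10" % "1.6.2"
libraryDependencies += "org.apache.spark" % "spark-sql_2.10" % "1.6.2"
libraryDependencies += "org.apache.spark" % "spark-hive_2.10" % "1.6.2"
libraryDependencies += "com.typesafe" % "config" % "1.3.1"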
dgadiraju / hive_create_tables.hql
-- Make sure you save oozie-fork-join-workflow.xml as workflow.xml in a directory (let's say daily_revenue)
use dgadiraju_oozie;
drop table orders;
drop table order_items;
create external table orders (
  order_id int,
  order_date string,
  order_customer_id int,
  order_status string
) row format delimited fields terminated by ','
location '/user/dgadiraju/daily_revenue/orders';
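-- The script drops order_items above, but its create statement is missing from
-- the capture. A sketch following the standard retail_db column layout; the
-- location path mirrors the orders table and is an assumption:
create external table order_items (
  order_item_id int,
  order_item_order_id int,
  order_item_product_id int,
  order_item_quantity int,
  order_item_subtotal float,
  order_item_product_price float
) row format delimited fields terminated by ','
location '/user/dgadiraju/daily_revenue/order_items';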
package retail
import com.typesafe.config.ConfigFactory
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Created by itversity on 27/03/17.
 */
object DailyRevenuePerDayPerDepartment {
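  // The gist body is truncated here. A sketch of the computation the name
  // implies, assuming the standard retail_db CSV layout and hypothetical
  // input/output arguments; none of the paths or column positions below are
  // from the original:
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DailyRevenuePerDayPerDepartment").setMaster("local")
    val sc = new SparkContext(conf)
    val inputPath = args(0)   // e.g. /public/retail_db
    val outputPath = args(1)

    val fs = FileSystem.get(sc.hadoopConfiguration)
    if (fs.exists(new Path(outputPath))) fs.delete(new Path(outputPath), true)

    // (order_id, order_date) for completed and closed orders only
    val orders = sc.textFile(inputPath + "/orders").map(_.split(",")).
      filter(o => o(3) == "COMPLETE" || o(3) == "CLOSED").
      map(o => (o(0).toInt, o(1)))
    // (order_id, (product_id, subtotal))
    val orderItems = sc.textFile(inputPath + "/order_items").map(_.split(",")).
      map(oi => (oi(1).toInt, (oi(2).toInt, oi(4).toFloat)))
    // (product_id, category_id), (category_id, department_id), (department_id, name)
    val products = sc.textFile(inputPath + "/products").map(_.split(",")).
      map(p => (p(0).toInt, p(1).toInt))
    val categories = sc.textFile(inputPath + "/categories").map(_.split(",")).
      map(c => (c(0).toInt, c(1).toInt))
    val departments = sc.textFile(inputPath + "/departments").map(_.split(",")).
      map(d => (d(0).toInt, d(1)))

    // Walk the join chain down to departments, then aggregate per (date, department)
    orders.join(orderItems).
      map { case (_, (date, (productId, subtotal))) => (productId, (date, subtotal)) }.
      join(products).
      map { case (_, ((date, subtotal), categoryId)) => (categoryId, (date, subtotal)) }.
      join(categories).
      map { case (_, ((date, subtotal), departmentId)) => (departmentId, (date, subtotal)) }.
      join(departments).
      map { case (_, ((date, subtotal), departmentName)) => ((date, departmentName), subtotal) }.
      reduceByKey(_ + _).
      map { case ((date, dept), revenue) => s"$date,$dept,$revenue" }.
      saveAsTextFile(outputPath)
    sc.stop()
  }
}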
package nyse
import com.typesafe.config.ConfigFactory
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Created by itversity on 28/03/17.
 */
object TopNStocksByVolume {
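  // Truncated in the capture. A sketch of a likely implementation, assuming
  // end-of-day NYSE records of the form ticker,date,open,high,low,close,volume;
  // the argument layout is an assumption:
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TopNStocksByVolume").setMaster("local")
    val sc = new SparkContext(conf)
    val inputPath = args(0)
    val topN = args(1).toInt

    val fs = FileSystem.get(sc.hadoopConfiguration)
    require(fs.exists(new Path(inputPath)), s"$inputPath does not exist")

    // Total traded volume per ticker, highest first
    sc.textFile(inputPath).map(_.split(",")).
      map(r => (r(0), r(6).toLong)).
      reduceByKey(_ + _).
      sortBy(-_._2).
      take(topN).
      foreach { case (ticker, volume) => println(s"$ticker\t$volume") }
    sc.stop()
  }
}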
package nyse
import com.typesafe.config.ConfigFactory
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Created by itversity on 28/03/17.
 */
object TopNStocksByVolumeWithName {
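  // Also truncated. Relative to TopNStocksByVolume above, the "WithName"
  // variant plausibly joins the aggregated volumes against a symbol-to-name
  // lookup file; the lookup format (symbol,company_name) is an assumption:
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TopNStocksByVolumeWithName").setMaster("local")
    val sc = new SparkContext(conf)

    // Total traded volume per ticker
    val totalVolumeByTicker = sc.textFile(args(0)).map(_.split(",")).
      map(r => (r(0), r(6).toLong)).
      reduceByKey(_ + _)
    // (symbol, company_name) lookup
    val companyNames = sc.textFile(args(1)).map(_.split(",")).
      map(r => (r(0), r(1)))

    totalVolumeByTicker.join(companyNames).
      map { case (ticker, (volume, name)) => (volume, (ticker, name)) }.
      sortByKey(ascending = false).
      take(args(2).toInt).
      foreach { case (volume, (ticker, name)) => println(s"$ticker\t$name\t$volume") }
    sc.stop()
  }
}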
-- First, create a database using this command:
-- create database <database_name>;
-- Then switch to that database:
-- use <database_name>;
-- Create the tables and load the data using the commands below.
-- Run show tables to list the tables.
create table departments (
department_id int,
department_name string
) row format delimited fields terminated by ',';
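-- The load step referred to above is missing from the capture. A typical load
-- for this table; the local path here is an assumption:
load data local inpath '/data/retail_db/departments' into table departments;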
package retail
import com.typesafe.config.ConfigFactory
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
/**
 * Created by itversity on 27/03/17.
 * build.sbt
 */
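// No object definition survives below the header. Given the HiveContext
// import, a minimal sketch of the usual pattern; the object name, database,
// and query are assumptions, not from the gist:
object DailyRevenueUsingHiveContext {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DailyRevenueUsingHiveContext").setMaster("local")
    val sc = new SparkContext(conf)
    val hiveContext = new HiveContext(sc)

    // Query tables registered in the Hive metastore, e.g. the ones created
    // by hive_create_tables.hql above
    hiveContext.sql("use dgadiraju_oozie")
    hiveContext.sql(
      """select o.order_date, sum(oi.order_item_subtotal) daily_revenue
        |from orders o join order_items oi on o.order_id = oi.order_item_order_id
        |where o.order_status in ('COMPLETE', 'CLOSED')
        |group by o.order_date""".stripMargin).
      show()
    sc.stop()
  }
}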
package retail
import com.typesafe.config.ConfigFactory
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Created by itversity on 01/04/17.
 * Problem statement:
 * Get the total revenue per day for all completed and closed orders.
 */
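// The object that solved this is truncated away. A sketch of one core-API
// solution to the stated problem; the object name, argument layout, and
// retail_db CSV column positions are assumptions:
object TotalRevenuePerDay {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TotalRevenuePerDay").setMaster("local")
    val sc = new SparkContext(conf)
    val inputPath = args(0)
    val outputPath = args(1)

    val fs = FileSystem.get(sc.hadoopConfiguration)
    if (fs.exists(new Path(outputPath))) fs.delete(new Path(outputPath), true)

    // (order_id, order_date) for COMPLETE/CLOSED orders only
    val orders = sc.textFile(inputPath + "/orders").map(_.split(",")).
      filter(o => o(3) == "COMPLETE" || o(3) == "CLOSED").
      map(o => (o(0).toInt, o(1)))
    // (order_id, order_item_subtotal)
    val orderItems = sc.textFile(inputPath + "/order_items").map(_.split(",")).
      map(oi => (oi(1).toInt, oi(4).toFloat))

    orders.join(orderItems).
      map { case (_, (date, subtotal)) => (date, subtotal) }.
      reduceByKey(_ + _).
      map { case (date, revenue) => s"$date,$revenue" }.
      saveAsTextFile(outputPath)
    sc.stop()
  }
}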
package nyse
import com.typesafe.config.ConfigFactory
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Created by itversity on 30/03/17.
 */
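// Nothing below the header survives. Given the SQLContext import, a sketch of
// a DataFrame/SQL variant of the NYSE volume ranking; the object name, schema,
// and argument layout are assumptions:
object TopNStocksByVolumeSQL {
  case class Eod(ticker: String, tradeDate: String, volume: Long)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TopNStocksByVolumeSQL").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // ticker,date,open,high,low,close,volume -> DataFrame
    val eod = sc.textFile(args(0)).map(_.split(",")).
      map(r => Eod(r(0), r(1), r(6).toLong)).toDF()
    eod.registerTempTable("eod")

    sqlContext.sql(
      """select ticker, sum(volume) total_volume
        |from eod group by ticker
        |order by total_volume desc""".stripMargin).
      show(args(1).toInt)
    sc.stop()
  }
}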
# fmp.conf: a multiplexing agent that saves one copy of the data in HDFS and
# streams another copy to Kafka, so that the data can be processed by
# streaming technologies such as Spark Streaming
# Name the components on this agent
fmp.sources = logsource
fmp.sinks = kafkasink hdfssink
fmp.channels = kafkachannel hdfschannel
# Describe/configure the source
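# The remainder of the file is truncated in the capture. A sketch of how such
# an agent is typically completed; the command, broker, topic, and HDFS path
# below are assumptions, not taken from the original:
fmp.sources.logsource.type = exec
fmp.sources.logsource.command = tail -F /opt/gen_logs/logs/access.log

# Send a copy of every event to both channels
fmp.sources.logsource.selector.type = replicating
fmp.sources.logsource.channels = kafkachannel hdfschannel

# Kafka-bound path (Flume 1.6-style KafkaSink properties)
fmp.channels.kafkachannel.type = memory
fmp.sinks.kafkasink.type = org.apache.flume.sink.kafka.KafkaSink
fmp.sinks.kafkasink.brokerList = localhost:9092
fmp.sinks.kafkasink.topic = fmp
fmp.sinks.kafkasink.channel = kafkachannel

# HDFS-bound path
fmp.channels.hdfschannel.type = memory
fmp.sinks.hdfssink.type = hdfs
fmp.sinks.hdfssink.hdfs.path = /user/dgadiraju/flume_demo
fmp.sinks.hdfssink.hdfs.fileType = DataStream
fmp.sinks.hdfssink.channel = hdfschannel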