Skip to content

Instantly share code, notes, and snippets.

@nikkisharma536
nikkisharma536 / emr_utils.py
Created January 14, 2019 09:32
ETL project - EMR utility function
from common.system_utils import read_file,generate_uuid
from common.s3_utils import upload_file, delete_file
from common.ssh_utils import execute_remote
# Run a local Hive (.hql) script against a remote EMR cluster.
# NOTE(review): this gist preview is truncated — only the first two steps
# are visible. The upload / ssh-execute / cleanup steps implied by the
# imports (upload_file, execute_remote, delete_file) are below the cut.
#
#   script_path    - local path of the .hql script to run
#   run_date       - run date, presumably forwarded to Hive as a -hiveconf
#                    value (see the RUN_DATE usage in the SQL gists) — TODO confirm
#   pem_file_path  - local path of the PEM key, presumably used by
#                    execute_remote for the SSH connection — TODO confirm
#   emr_master_ip  - IP address of the EMR master node
#   emr_username   - SSH user on the master node ('hadoop' is the EMR default)
def run_on_hive(script_path, run_date, pem_file_path, emr_master_ip, emr_username='hadoop'):
# Read the whole script into memory so it can be re-uploaded to S3.
content = read_file(script_path)
# Generate random S3 path for temp scripts
# (a uuid in the key avoids collisions between concurrent runs;
# bucket name is hard-coded to the author's playground bucket)
s3_path = 's3://nikita-ds-playground/scripts/' + generate_uuid() + '.hql'
@nikkisharma536
nikkisharma536 / generate_data.py
Created January 14, 2019 09:33
ETL project - generate log data
import os
from common.s3_utils import copy_to_s3
from common.system_utils import execute_local, del_local_file
import datetime
# Generate fake Apache access-log data by driving a locally checked-out
# Fake-Apache-Log-Generator project.
# NOTE(review): gist preview is truncated — only the chdir into the
# generator's directory is visible; the actual generation command
# (execute_local) and copy_to_s3 / cleanup steps implied by the imports
# are below the cut.
def generate_data():
# Hard-coded local checkout path of the log-generator project —
# presumably https://github.com/kiritbasu/Fake-Apache-Log-Generator; TODO confirm.
path = "/Users/nikki/work/code/Fake-Apache-Log-Generator/"
# Switch the process CWD so the generator's relative paths resolve — TODO confirm
os.chdir(path)
@nikkisharma536
nikkisharma536 / incremental_script__clean_data.sql
Created January 14, 2019 09:35
ETL project - SQL script for clean table
-- Creating an ETL script parameterized by a run_date value.
-- NOTE(review): gist preview is truncated — the INSERT/SELECT body of the
-- clean-table load is below the cut; only the header is visible here.
-- HIVE CONF
-- Allow dynamic partition values without requiring at least one static one.
set hive.exec.dynamic.partition.mode=nonstrict;
-- external parameters
-- set RUN_DATE=2019-01-05;
-- USAGE:
-- hive -hiveconf RUN_DATE='2019-01-05' -f script.hql
@nikkisharma536
nikkisharma536 / clean_data_job.py
Created January 14, 2019 09:37
ETL project - clean data job
#1. Read the script file
#2. Generate temp s3 path with uuid generator
#3. save the script file on s3 path
#4. ssh and execute hive script
#5. delete hive script from s3
import os
import datetime
from common.emr_utils import run_on_hive
from config import key_path, ip
@nikkisharma536
nikkisharma536 / summary_data_job.py
Created January 14, 2019 09:39
ETL project - summary data job
import os
import datetime
from common.emr_utils import run_on_hive
from config import key_path, ip
# Script entry point: build the absolute path of the summary Hive script.
# NOTE(review): gist preview is truncated — the run_on_hive(...) call
# implied by the import above is below the cut.
if __name__ == '__main__' :
# Resolve relative to the current working directory, so this job must be
# launched from the project root — TODO confirm against the deploy setup.
dir_path = os.getcwd()
# os.path.join tolerates the leading "./" in the second component.
file_path = os.path.join(dir_path, "./scripts/summary.hql")
@nikkisharma536
nikkisharma536 / incremental_script__summary_data.sql
Created January 14, 2019 09:41
ETL project - SQL script for the summary table
-- Summary table keyed by URI and dte (date column of the access log).
-- NOTE(review): gist preview is truncated — the INSERT/SELECT body of the
-- summary load is below the cut; only the header is visible here.
-- HIVE CONF
-- Allow dynamic partition values without requiring at least one static one.
set hive.exec.dynamic.partition.mode=nonstrict;
-- external parameters
-- set RUN_DATE=2019-01-05;
-- USAGE:
-- hive -hiveconf RUN_DATE='2019-01-05' -f summary.hql
@nikkisharma536
nikkisharma536 / config.py
Created January 15, 2019 09:46
ETL project : configuration file
# Central configuration module: all paths and settings shared
# across the project live here.
# Local path of the PEM key used to SSH into the EMR master node.
# SECURITY NOTE(review): PEM paths (and especially the key itself) should
# not be committed/exposed — consider an environment variable or a
# secrets manager instead of a hard-coded path.
key_path = '/tmp/path/to/key.pem'
# IP address of the EMR cluster's master node.
# NOTE(review): gist preview is truncated — the `ip = ...` assignment
# imported by the job scripts is below the cut.
@nikkisharma536
nikkisharma536 / regex_script.sql
Last active January 15, 2019 23:03
ETL project - Regex script
-- External table over raw Apache access-log lines, one column per
-- log field. Everything is kept as STRING at this (raw) layer; typing
-- happens downstream in the clean/summary scripts.
-- NOTE(review): gist preview is truncated — the closing paren, the
-- (presumable) RegexSerDe ROW FORMAT clause and the LOCATION are below
-- the cut; "regex" in the table name suggests RegexSerDe but the visible
-- DDL does not show it — TODO confirm.
CREATE EXTERNAL TABLE nik.regex_access_log (
ip STRING,
dte STRING,
request_type STRING,
uri STRING,
protocol STRING,
status STRING,
bytes_sent STRING,
referer STRING,
useragent STRING
@nikkisharma536
nikkisharma536 / upload_data.py
Created January 29, 2019 04:30
upload dataset
# import libraries
# NOTE(review): tensorflow and numpy are not used in the visible lines —
# presumably used below the gist-preview cut.
import numpy as np
import tensorflow as tf
import pandas as pd
# upload train and test data
# Hard-coded local paths to the Kaggle download — these are machine-specific
# and would need to change on any other machine.
train = pd.read_csv('/users/nikki/Downloads/all (1)/train.csv')
test = pd.read_csv('/users/nikki/Downloads/all (1)/test.csv')
# Separate the label column from the training features; the "label" column
# name suggests the Kaggle digit-recognizer dataset — TODO confirm.
target = train["label"]
@nikkisharma536
nikkisharma536 / train_test.py
Created January 29, 2019 04:33
Create a train/test set
#Create a train/test set
from sklearn.model_selection import train_test_split
# NOTE(review): X_train, target, np and test come from the companion
# upload_data.py gist — this snippet does not stand alone.
# 80/20 split with a fixed seed for reproducibility; X_train is rebound
# in place from the full feature set to the training portion.
X_train, X_test, y_train, y_test = train_test_split(X_train,target, test_size=0.2, random_state=42)
# Cast labels to int32, the dtype expected downstream — TODO confirm
# (likely for the tensorflow estimator in the rest of this series).
y_train = np.asarray(y_train, dtype=np.int32)
y_test =np.asarray(y_test, dtype=np.int32)
# Batch size equals the entire training set, i.e. full-batch training.
batch_size =len(X_train)
# Sanity check of the resulting shapes.
print(X_train.shape, y_train.shape,y_test.shape, test.shape )