Skip to content

Instantly share code, notes, and snippets.

@nikkisharma536
nikkisharma536 / emr_utils.py
Created January 14, 2019 09:32
ETL project - EMR utility function
from common.system_utils import read_file,generate_uuid
from common.s3_utils import upload_file, delete_file
from common.ssh_utils import execute_remote
# Run a local Hive (.hql) script against a remote EMR cluster.
# NOTE(review): this gist preview is truncated — only the first two steps
# are visible. The upload / ssh-execute / cleanup steps implied by the
# imports (upload_file, execute_remote, delete_file) are below the cut.
#
#   script_path    - local path of the .hql script to run
#   run_date       - run date, presumably forwarded to Hive as a -hiveconf
#                    value (see the RUN_DATE usage in the SQL gists) — TODO confirm
#   pem_file_path  - local path of the PEM key, presumably used by
#                    execute_remote for the SSH connection — TODO confirm
#   emr_master_ip  - IP address of the EMR master node
#   emr_username   - SSH user on the master node ('hadoop' is the EMR default)
def run_on_hive(script_path, run_date, pem_file_path, emr_master_ip, emr_username='hadoop'):
# Read the whole script into memory so it can be re-uploaded to S3.
content = read_file(script_path)
# Generate random S3 path for temp scripts
# (a uuid in the key avoids collisions between concurrent runs;
# bucket name is hard-coded to the author's playground bucket)
s3_path = 's3://nikita-ds-playground/scripts/' + generate_uuid() + '.hql'
@nikkisharma536
nikkisharma536 / generate_data.py
Created January 14, 2019 09:33
ETL project - generate log data
import os
from common.s3_utils import copy_to_s3
from common.system_utils import execute_local, del_local_file
import datetime
# Generate fake Apache access-log data by driving a locally checked-out
# Fake-Apache-Log-Generator project.
# NOTE(review): gist preview is truncated — only the chdir into the
# generator's directory is visible; the actual generation command
# (execute_local) and copy_to_s3 / cleanup steps implied by the imports
# are below the cut.
def generate_data():
# Hard-coded local checkout path of the log-generator project —
# presumably https://github.com/kiritbasu/Fake-Apache-Log-Generator; TODO confirm.
path = "/Users/nikki/work/code/Fake-Apache-Log-Generator/"
# Switch the process CWD so the generator's relative paths resolve — TODO confirm
os.chdir(path)
@nikkisharma536
nikkisharma536 / incremental_script__clean_data.sql
Created January 14, 2019 09:35
ETL project - SQL script for clean table
-- Creating an ETL script parameterized by a run_date value.
-- NOTE(review): gist preview is truncated — the INSERT/SELECT body of the
-- clean-table load is below the cut; only the header is visible here.
-- HIVE CONF
-- Allow dynamic partition values without requiring at least one static one.
set hive.exec.dynamic.partition.mode=nonstrict;
-- external parameters
-- set RUN_DATE=2019-01-05;
-- USAGE:
-- hive -hiveconf RUN_DATE='2019-01-05' -f script.hql
@nikkisharma536
nikkisharma536 / clean_data_job.py
Created January 14, 2019 09:37
ETL project - clean data job
#1. Read the script file
#2. Generate temp s3 path with uuid generator
#3. save the script file on s3 path
#4. ssh and execute hive script
#5. delete hive script from s3
import os
import datetime
from common.emr_utils import run_on_hive
from config import key_path, ip
@nikkisharma536
nikkisharma536 / summary_data_job.py
Created January 14, 2019 09:39
ETL project - summary data job
import os
import datetime
from common.emr_utils import run_on_hive
from config import key_path, ip
# Script entry point: build the absolute path of the summary Hive script.
# NOTE(review): gist preview is truncated — the run_on_hive(...) call
# implied by the import above is below the cut.
if __name__ == '__main__' :
# Resolve relative to the current working directory, so this job must be
# launched from the project root — TODO confirm against the deploy setup.
dir_path = os.getcwd()
# os.path.join tolerates the leading "./" in the second component.
file_path = os.path.join(dir_path, "./scripts/summary.hql")
@nikkisharma536
nikkisharma536 / incremental_script__summary_data.sql
Created January 14, 2019 09:41
ETL project - SQL script for the summary table
-- Summary table keyed by URI and dte (date column of the access log).
-- NOTE(review): gist preview is truncated — the INSERT/SELECT body of the
-- summary load is below the cut; only the header is visible here.
-- HIVE CONF
-- Allow dynamic partition values without requiring at least one static one.
set hive.exec.dynamic.partition.mode=nonstrict;
-- external parameters
-- set RUN_DATE=2019-01-05;
-- USAGE:
-- hive -hiveconf RUN_DATE='2019-01-05' -f summary.hql
@nikkisharma536
nikkisharma536 / config.py
Created January 15, 2019 09:46
ETL project : configuration file
# Central configuration module: all paths and settings shared
# across the project live here.
# Local path of the PEM key used to SSH into the EMR master node.
# SECURITY NOTE(review): PEM paths (and especially the key itself) should
# not be committed/exposed — consider an environment variable or a
# secrets manager instead of a hard-coded path.
key_path = '/tmp/path/to/key.pem'
# IP address of the EMR cluster's master node.
# NOTE(review): gist preview is truncated — the `ip = ...` assignment
# imported by the job scripts is below the cut.
@nikkisharma536
nikkisharma536 / regex_script.sql
Last active January 15, 2019 23:03
ETL project - Regex script
-- External table over raw Apache access-log lines, one column per
-- log field. Everything is kept as STRING at this (raw) layer; typing
-- happens downstream in the clean/summary scripts.
-- NOTE(review): gist preview is truncated — the closing paren, the
-- (presumable) RegexSerDe ROW FORMAT clause and the LOCATION are below
-- the cut; "regex" in the table name suggests RegexSerDe but the visible
-- DDL does not show it — TODO confirm.
CREATE EXTERNAL TABLE nik.regex_access_log (
ip STRING,
dte STRING,
request_type STRING,
uri STRING,
protocol STRING,
status STRING,
bytes_sent STRING,
referer STRING,
useragent STRING
@nikkisharma536
nikkisharma536 / upload_data.py
Created January 29, 2019 04:30
upload dataset
# import libraries
# NOTE(review): tensorflow and numpy are not used in the visible lines —
# presumably used below the gist-preview cut.
import numpy as np
import tensorflow as tf
import pandas as pd
# upload train and test data
# Hard-coded local paths to the Kaggle download — these are machine-specific
# and would need to change on any other machine.
train = pd.read_csv('/users/nikki/Downloads/all (1)/train.csv')
test = pd.read_csv('/users/nikki/Downloads/all (1)/test.csv')
# Separate the label column from the training features; the "label" column
# name suggests the Kaggle digit-recognizer dataset — TODO confirm.
target = train["label"]
@nikkisharma536
nikkisharma536 / train_test.py
Created January 29, 2019 04:33
Create a train/test set
#Create a train/test set
from sklearn.model_selection import train_test_split
# NOTE(review): X_train, target, np and test come from the companion
# upload_data.py gist — this snippet does not stand alone.
# 80/20 split with a fixed seed for reproducibility; X_train is rebound
# in place from the full feature set to the training portion.
X_train, X_test, y_train, y_test = train_test_split(X_train,target, test_size=0.2, random_state=42)
# Cast labels to int32, the dtype expected downstream — TODO confirm
# (likely for the tensorflow estimator in the rest of this series).
y_train = np.asarray(y_train, dtype=np.int32)
y_test =np.asarray(y_test, dtype=np.int32)
# Batch size equals the entire training set, i.e. full-batch training.
batch_size =len(X_train)
# Sanity check of the resulting shapes.
print(X_train.shape, y_train.shape,y_test.shape, test.shape )