SELECT job_id, app_name, nominal_time, missing_dependencies
FROM COORD_ACTIONS
JOIN (SELECT id AS coord_id, app_name
      FROM COORD_JOBS
      WHERE bundle_id = '0001994-130622061249977-oozie-oozi-B') AS coord_jobs
  ON coord_id = job_id
WHERE status = 'WAITING'
ORDER BY app_name, nominal_time ASC;
#!/usr/bin/env bash
DATE="2013-06-19"
PATHS="/transform/sibyl/google/6291/Impression /input/sibyl/amp/6291/Impression"
for h in {1..23}; do
  hour=$(printf %02d00 "$h")   # zero-pad the loop variable, e.g. 0100, 0200, ...
  for basepath in $PATHS; do
    path="${basepath}/${DATE}/${hour}"
    # Create the _SUCCESS flag only if it does not already exist.
    hadoop fs -stat "${path}/_SUCCESS" || hadoop fs -touchz "${path}/_SUCCESS"
  done
done
SET mapreduce.output.fileoutputformat.compress true;
SET mapreduce.output.fileoutputformat.compress.codec org.apache.hadoop.io.compress.SnappyCodec;
SET mapreduce.output.fileoutputformat.compress.type BLOCK;
T = LOAD '/logs/{lax1,nym1}/2013-05-23/**/urlus/*.snappy' USING PigStorage('\t')
    AS (f0:chararray, f1:chararray, f2:chararray, f3:chararray, f4:chararray, f5:chararray);
formatted = FOREACH T GENERATE
    REGEX_EXTRACT($0, '.+\\]:(.*)$', 1) AS hostname,
    REGEX_EXTRACT($1, '(.*) UTC$', 1) AS tstamp,
    $2 AS incoming_url,
    $3 AS segments,
    $4 AS to_date,
    $5 AS url_match;
STORE formatted INTO '/tmp/urlus_for_netezza' USING PigStorage('\t');
chris@oj01-506:~$ hive -hiveconf hive.root.logger=ALL,console
13/05/16 18:01:51 WARN conf.Configuration: mapred.max.split.size is deprecated. Instead, use mapreduce.input.fileinputformat.split.maxsize
13/05/16 18:01:51 WARN conf.Configuration: mapred.min.split.size is deprecated. Instead, use mapreduce.input.fileinputformat.split.minsize
13/05/16 18:01:51 WARN conf.Configuration: mapred.min.split.size.per.rack is deprecated. Instead, use mapreduce.input.fileinputformat.split.minsize.per.rack
13/05/16 18:01:51 WARN conf.Configuration: mapred.min.split.size.per.node is deprecated. Instead, use mapreduce.input.fileinputformat.split.minsize.per.node
13/05/16 18:01:51 WARN conf.Configuration: mapred.reduce.tasks is deprecated. Instead, use mapreduce.job.reduces
13/05/16 18:01:51 WARN conf.Configuration: mapred.reduce.tasks.speculative.execution is deprecated. Instead, use mapreduce.reduce.speculative
13/05/16 18:01:51 WARN conf.Configuration: org.apache.hadoop.hive.conf.LoopingByteArrayInputStream@62acc57:an attem
CREATE EXTERNAL TABLE IF NOT EXISTS pythia_profilemod
PARTITIONED BY (year INT, month INT, day INT, hour INT)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
WITH SERDEPROPERTIES (
  'avro.schema.url'='${SCHEMAPATH}')
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat';
@andry1
andry1 / gist:5518946
Created May 4, 2013 22:13
Flume-OG to flume-ng mirroring config. The nullDeco in front of the rpcSink was the important part to make it work with flume-ng 1.3/CDH4.2.1
dc-ad-transaction-log-agent: syslogTcp(5141) | [value("dc", "dc") batch(20000, 15000) gzip ackedWriteAhead stubbornAppend insistentOpen < logicalSink("collector1") ? < logicalSink("collector20") ? logicalSink("collector19") > >, < nullDeco rpcSink("10.24.154.82",5141) ? nullDeco rpcSink("10.24.154.74",5141) >];
dc-ad-transaction-log-agent2: syslogTcp(5142) | [value("dc", "dc") batch(20000, 15000) gzip ackedWriteAhead stubbornAppend insistentOpen < logicalSink("collector11") ? < logicalSink("collector18") ? logicalSink("collector17") > >, < nullDeco rpcSink("10.24.154.74",5141) ? nullDeco rpcSink("10.24.154.78",5141) >];
dc-ad-transaction-log-agent3: syslogTcp(5143) | [value("dc", "dc") batch(20000, 15000) gzip ackedWriteAhead stubbornAppend insistentOpen < logicalSink("collector16") ? < logicalSink("collector15") ? logicalSink("collector14") > >, < nullDeco rpcSink("10.24.154.82",5142) ? nullDeco rpcSink("10.24.154.74",5142) >];
dc-ad-transaction-log-agent4: syslogTcp(5144) | [value("dc", "dc") batch(20000, 1
@andry1
andry1 / gist:5483334
Created April 29, 2013 17:45
Hive LastActivity UDF
package com.collective.hive;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyLongObjectInspector;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyPrimitiveObjectInspectorFactory;
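The preview cuts off at the imports. Once a UDF like this is packaged into a jar, a typical way to exercise it from the shell looks like the sketch below; the jar path, class name LastActivity, table, and columns are assumptions, not taken from the gist (only the package com.collective.hive appears above).

#!/usr/bin/env bash
# Hypothetical usage sketch: jar path, class name, table, and column
# names are stand-ins for illustration only.
hive -e "
  ADD JAR /tmp/collective-hive-udfs.jar;
  CREATE TEMPORARY FUNCTION last_activity AS 'com.collective.hive.LastActivity';
  SELECT user_id, last_activity(activity_timestamps) FROM user_events LIMIT 10;
"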
@andry1
andry1 / gist:5152739
Created March 13, 2013 14:39
Example HTTPFS communication using curl+kerberos
#!/bin/bash
#
# Uses a keytab to auth to kerberos and then creates an empty file/directory
# to notify oozie that data is ready.
# Specify the name of the dataset to notify for as the first argument,
# and optionally the day in UTC during which the data is intended to be loaded
# (defaults to the current day in UTC)
# e.g. :
#
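The preview ends before the example invocation, but a minimal sketch of the flow the comment describes might look like the following; the HTTPFS host, Kerberos principal, keytab path, and directory layout are hypothetical stand-ins, not values from the gist.

#!/bin/bash
# Hedged sketch of the pattern described above; host, principal, keytab
# path, and the flag-directory layout are hypothetical.
DATASET="$1"
DAY="${2:-$(date -u +%Y-%m-%d)}"

# Authenticate to Kerberos from a keytab so curl can use SPNEGO.
kinit -kt /etc/security/keytabs/notify.keytab notify@EXAMPLE.COM

# Create an empty directory via HTTPFS (WebHDFS REST API) so the Oozie
# coordinator's dependency check sees the dataset as ready.
curl --negotiate -u : -X PUT \
  "http://httpfs.example.com:14000/webhdfs/v1/data/${DATASET}/${DAY}?op=MKDIRS"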
@andry1
andry1 / gist:3919831
Created October 19, 2012 18:29
Quasi Output
[ruby-1.9.3-p194] (master)
[02:19 PM] chris@apocalypse ~/src/collective/quasi/automati$ VBoxManage --version
4.1.23r80870
[02:20 PM] chris@apocalypse ~/src/collective/quasi/automati$ vagrant --version
Vagrant version 0.9.0
[ruby-1.9.3-p194] (master)
[02:13 PM] chris@apocalypse ~/src/collective/quasi$ git pull origin master
From github.com:collectivemedia/quasi
* branch master -> FETCH_HEAD