airawat · February 12, 2018 10:10
diff --git a/00-OozieCoordinatorJobWithFileAsTrigger b/00-OozieCoordinatorJobWithFileAsTrigger
 This gist includes the components of an Oozie (trigger-file-initiated) coordinator job - 
 scripts/code, sample data and commands;  Oozie actions covered: hdfs action, email action, 
 java main action, hive action;  Oozie controls covered: decision, fork-join;  The workflow 
 includes a sub-workflow that runs two hive actions concurrently.  The hive table is 
 partitioned;  Parsing uses the hive regex serde and Java regex.  Also, the java mapper gets 
 the input directory path and includes part of it in the key.
 
 Usecase
 -------
 Parse Syslog generated log files to generate reports;

 Pictorial overview of job:
 --------------------------
 http://hadooped.blogspot.com/p/ooziecooridnatorjobtrigfiledep-pix.html
 
 Includes:
 ---------
 Sample data and structure:           01-SampleDataAndStructure 
 Data and script download:            02-DataAndScriptDownload
 Data load commands:                  03-HdfsLoadCommands
 Java MR - Mapper code:               04A-MapperJavaCode
 Java MR - Reducer code:              04B-ReducerJavaCode
 Java MR - Driver code:               04C-DriverJavaCode
 Command to test Java MR program:     04D-CommandTestJavaMRProg
 Hive -create log table command       05A-HiveCreateTable
 Hive -load partitions                05B-HiveLoadPartitions
 Hive commands to test data loaded    05C-HiveDataLoadTestCommands
 Hive QL script for report 2          05D-HiveQLReport2
 Hive QL script for report 3          05E-HiveQLReport3
 Oozie configuration for email        06-OozieSMTPconfiguration
 Oozie coordinator properties file    07-OozieCoordinatorProperties
 Oozie coordinator conf file          08-OozieCoordinatorXML
 Oozie workflow conf file             09-OozieWorkflowXML
 Oozie sub-workflow conf file         10-OozieSubWorkflowXML
 Oozie commands                       11-OozieJobExecutionCommands
 Output -Report1                      12A-Rpt1-JavaMainProgramOutput
 Output -Report2                      12B-Rpt2-HiveProgramOutputIssuesByMonth
 Output -Report3                      12C-Rpt3-HiveProgramOutputTop3Issues
 Oozie web console - screenshots      13-OozieWebConsoleScreenshots
diff --git a/01-SampleDataAndStructure b/01-SampleDataAndStructure
 Sample data
 ------------
 May  3 11:52:54 cdh-dn03 init: tty (/dev/tty6) main process (1208) killed by TERM signal
 May  3 11:53:31 cdh-dn03 kernel: registered taskstats version 1
 May  3 11:53:31 cdh-dn03 kernel: sr0: scsi3-mmc drive: 32x/32x xa/form2 tray
 May  3 11:53:31 cdh-dn03 kernel: piix4_smbus 0000:00:07.0: SMBus base address uninitialized - upgrade BIOS or use force_addr=0xaddr
 May  3 11:53:31 cdh-dn03 kernel: nf_conntrack version 0.5.0 (7972 buckets, 31888 max)
 May  3 11:53:57 cdh-dn03 kernel: hrtimer: interrupt took 11250457 ns
 May  3 11:53:59 cdh-dn03 ntpd_initres[1705]: host name not found: 0.rhel.pool.ntp.org
 
 Structure
 ----------
 Month   = May  
 Day     = 3 
 Time    = 11:52:54 
 Node    = cdh-dn03 
 Process = init: 
 Log msg = tty (/dev/tty6) main process (1208) killed by TERM signal
diff --git a/02-DataAndScriptDownload b/02-DataAndScriptDownload
 Data download
 -------------
 Github: 
 https://github.com/airawat/OozieSamples
 
 Email me at [email protected] if you encounter any issues
 
 
 Directory structure
 -------------------
 oozieProject
  data
      airawat-syslog
        <<Node-Name>>
            <<Year>>
                <<Month>>
                    messages

  sampleCoordinatorJobTrigFileDep
        triggerDir
            trigger.dat

        coordinatorConf/
            coordinator.properties
            coordinator.xml
 
        workflowApp
            workflow.xml
 
            hiveSubWorkflowApp
                hive-site.xml
                hiveConsolidated-Year-Month-Report.hql
                hiveTop3Processes-Year-Report.hql
                workflow.xml
        
            lib
                LogEventCount.jar
diff --git a/03-HdfsLoadCommands b/03-HdfsLoadCommands
 Hdfs load commands
 ------------------
 
 $ hadoop fs -mkdir oozieProject
 $ hadoop fs -put oozieProject/data oozieProject/
 $ hadoop fs -put oozieProject/sampleCoordinatorJobTrigFileDep oozieProject/
 
 Run command below to validate load against expected directory structure in section 02-DataAndScriptDownload
 $ hadoop fs -ls -R oozieProject/sampleCoordinatorJobTrigFileDep | awk '{print $8}'

 Remove the trigger file directory - we will load it when we want to execute the job
 $ hadoop fs -rm -R oozieProject/sampleCoordinatorJobTrigFileDep/triggerDir/
diff --git a/04A-MapperJavaCode b/04A-MapperJavaCode
 // Source code for Mapper
 //-----------------------------------------------------------
 // LogEventCountMapper.java
 //-----------------------------------------------------------
 // Java program that parses logs using regex
 // The program counts the number of processes logged by year.  
 // E.g. Key=2013-ntpd; Value=1;
 
 package Airawat.Oozie.Samples;
 
 import java.io.IOException;
 
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 
 
 /**
  * Mapper that parses syslog lines with a regular expression and emits one
  * {@code (<year>-<process>, 1)} pair per well-formed line, e.g. {@code 2013-ntpd}.
  *
  * <p>The regex captures six groups: month name, day, time, node, process
  * (including its trailing colon and optional {@code [pid]}) and log message.
  * Lines that do not match are reported on stderr and skipped.
  */
 public class LogEventCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

  String strLogEntryPattern = "(\\w+)\\s+(\\d+)\\s+(\\d+:\\d+:\\d+)\\s+(\\w+\\W*\\w*)\\s+(.*?\\:)\\s+(.*$)";
  public static final int NUM_FIELDS = 6;
  Text strEvent = new Text("");

  // Compiled once per mapper instance rather than once per input record.
  private final Pattern objPtrn = Pattern.compile(strLogEntryPattern);

  // Reused for every emitted record; the value written is always 1.
  private final IntWritable one = new IntWritable(1);

  /**
   * Parses one log line and writes {@code <year>-<process>} with a count of 1.
   *
   * @param key     input key supplied by the framework (unused)
   * @param value   one line of the syslog file
   * @param context task context; provides the input split and receives the output
   * @throws IOException          if writing the output fails
   * @throws InterruptedException if the task is interrupted while writing
   */
  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {

    String strLogEntryLine = value.toString();

    Matcher objPatternMatcher = objPtrn.matcher(strLogEntryLine);
    if (!objPatternMatcher.matches() || NUM_FIELDS != objPatternMatcher.groupCount()) {
      System.err.println("Bad log entry (or problem with RE?):");
      System.err.println(strLogEntryLine);
      return;
    }

    // Resolve the split's path once instead of three times.
    String strInputPath = ((FileSplit) context.getInputSplit()).getPath().toString();

    strEvent.set(extractYear(strInputPath) + "-" + extractProcessName(objPatternMatcher.group(5)));
    context.write(strEvent, one);
  }

  /**
   * Returns the four characters that start 16 characters before the end of the path.
   * With the documented layout {@code .../<yyyy>/<MM>/messages} this is the year.
   * NOTE(review): a path shorter than 16 characters throws
   * StringIndexOutOfBoundsException, exactly as the original expression did.
   */
  private static String extractYear(String strInputPath) {
    int intLength = strInputPath.length();
    return strInputPath.substring(intLength - 16, intLength - 12);
  }

  /**
   * Strips the decoration from the process field: everything from the first
   * {@code [} onwards when a pid is present, otherwise just the last character
   * (the trailing colon captured by the regex).
   */
  private static String extractProcessName(String strProcessField) {
    int intBracketPos = strProcessField.indexOf("[");
    if (intBracketPos == -1) {
      return strProcessField.substring(0, strProcessField.length() - 1);
    }
    return strProcessField.substring(0, intBracketPos);
  }
 }
diff --git a/04B-ReducerJavaCode b/04B-ReducerJavaCode
 // Source code for reducer
 //--------------------------
 // LogEventCountReducer.java
 //--------------------------
 
 package Airawat.Oozie.Samples;
 import java.io.IOException;
 
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Reducer;
 
  
 /**
  * Reducer that adds up all counts received for a key and emits the total.
  */
 public class LogEventCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

  /**
   * Sums the values for {@code key} and writes {@code (key, total)}.
   *
   * @param key     the event key produced by the mapper
   * @param values  the counts emitted for that key
   * @param context task context that receives the output record
   * @throws IOException          if writing the output fails
   * @throws InterruptedException if the task is interrupted while writing
   */
  @Override
  public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int total = 0;

    java.util.Iterator<IntWritable> counts = values.iterator();
    while (counts.hasNext()) {
      total = total + counts.next().get();
    }

    IntWritable result = new IntWritable(total);
    context.write(key, result);
  }
 }
diff --git a/04C-DriverJavaCode b/04C-DriverJavaCode
 // Source code for driver
 //--------------------------
 // LogEventCount.java
 //--------------------------
 
 package Airawat.Oozie.Samples;
 
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.Job;
 
 /**
  * Driver for the "Syslog Event Rollup" MapReduce job: wires
  * {@link LogEventCountMapper} and {@link LogEventCountReducer} together,
  * runs the job and exits with 0 on success or 1 on failure.
  */
 public class LogEventCount {

  /**
   * Entry point.
   *
   * @param args {@code args[0]} is the input path, {@code args[1]} the output path
   * @throws Exception if job setup or execution fails
   */
  public static void main(String[] args) throws Exception {

    if (args.length != 2) {
      // Usage errors go to stderr so they are not mistaken for program output.
      System.err.println(
          "Usage: Airawat.Oozie.Samples.LogEventCount <input dir> <output dir>");
      System.exit(-1);
    }

    //Instantiate a Job object for your job's configuration.
    Job job = new Job();

    //Job jar file
    job.setJarByClass(LogEventCount.class);

    //Job name
    job.setJobName("Syslog Event Rollup");

    //Paths
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    //Mapper and reducer classes
    job.setMapperClass(LogEventCountMapper.class);
    job.setReducerClass(LogEventCountReducer.class);

    //The reducer only sums counts and its input and output types are the same,
    //so it can also pre-aggregate map output as a combiner without changing results.
    job.setCombinerClass(LogEventCountReducer.class);

    //Job's output key and value classes
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    //Number of reduce tasks
    job.setNumReduceTasks(3);

    //Start the MapReduce job, wait for it to finish.
    boolean success = job.waitForCompletion(true);
    System.exit(success ? 0 : 1);
  }
 }
diff --git a/04D-CommandTestJavaMRProg b/04D-CommandTestJavaMRProg
 Commands to test the java program
 ---------------------------------
 
 a) Command to run the program
 
 $ hadoop jar oozieProject/sampleCoordinatorJobTrigFileDep/workflowApp/lib/LogEventCount.jar Airawat.Oozie.Samples.LogEventCount "oozieProject/sampleCoordinatorJobTrigFileDep/data/*/*/*/*/*" "oozieProject/sampleCoordinatorJobTrigFileDep/myCLIOutput" 
 
 b) Command to view results
 
 $ hadoop fs -cat oozieProject/sampleCoordinatorJobTrigFileDep/myCLIOutput/part*
 
 c) Results
 
 2013-NetworkManager  7
 2013-console-kit-daemon  7
 2013-gnome-session	11
 2013-init	166
 2013-kernel	810
 2013-login	2
 2013-nm-dispatcher.action	4
 2013-ntpd_initres	4133
 2013-polkit-agent-helper-1	8
 2013-pulseaudio	18
 2013-spice-vdagent	15
 2013-sshd	6
 2013-sudo	8
 2013-udevd	6
 
diff --git a/05A-HiveCreateTable b/05A-HiveCreateTable
 Hive script to create table for logs
 -------------------------------------
 
 hive> 
    CREATE EXTERNAL TABLE SysLogEvents(
    month_name STRING,
    day STRING,
    time STRING,
    host STRING,
    event STRING,
    log STRING)
    PARTITIONED BY(node string,year int, month int)
    ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' 
    WITH SERDEPROPERTIES (
    "input.regex" = "(\\w+)\\s+(\\d+)\\s+(\\d+:\\d+:\\d+)\\s+(\\w+\\W*\\w*)\\s+(.*?\\:)\\s+(.*$)"
    ) 
    stored as textfile;
diff --git a/05B-HiveLoadPartitions b/05B-HiveLoadPartitions
 Hive scripts to create and load partitions
 -------------------------------------------
 
 Note: Replace my user ID "akhanolk" with yours
 
 hive >
 
 Alter table SysLogEvents Add IF NOT EXISTS partition(node="cdh-dev01",year=2013, month=04)
     location '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dev01/2013/04/';
 Alter table SysLogEvents Add IF NOT EXISTS partition(node="cdh-dev01",year=2013, month=05)
     location '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dev01/2013/05/';
 
 Alter table SysLogEvents Add IF NOT EXISTS partition(node="cdh-dn01",year=2013, month=05)
     location '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dn01/2013/05/';
 
 Alter table SysLogEvents Add IF NOT EXISTS partition(node="cdh-dn02",year=2013, month=04)
     location '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dn02/2013/04/';
 Alter table SysLogEvents Add IF NOT EXISTS partition(node="cdh-dn02",year=2013, month=05)
     location '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dn02/2013/05/';
 
 Alter table SysLogEvents Add IF NOT EXISTS partition(node="cdh-dn03",year=2013, month=04)
     location '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dn03/2013/04/';
 Alter table SysLogEvents Add IF NOT EXISTS partition(node="cdh-dn03",year=2013, month=05)
     location '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dn03/2013/05/';
 
 Alter table SysLogEvents Add IF NOT EXISTS partition(node="cdh-jt01",year=2013, month=04)
     location '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-jt01/2013/04/';
 Alter table SysLogEvents Add IF NOT EXISTS partition(node="cdh-jt01",year=2013, month=05)
     location '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-jt01/2013/05/';
 
 Alter table SysLogEvents Add IF NOT EXISTS partition(node="cdh-nn01",year=2013, month=05)
     location '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-nn01/2013/05/';
 
 Alter table SysLogEvents Add IF NOT EXISTS partition(node="cdh-vms",year=2013, month=05)
     location '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-vms/2013/05/';
diff --git a/05C-HiveDataLoadTestCommands b/05C-HiveDataLoadTestCommands
 Hive ql to test data loaded
 ----------------------------
 
 hive>
    --Print headers
    set hive.cli.print.header=true;
 
    --Need to add this jar for MR to work..your env may not need it
    add jar hadoop-lib/hive-contrib-0.10.0-cdh4.2.0.jar; 
 
    --Sample query
    select * from SysLogEvents limit 2;
 
diff --git a/05D-HiveQLReport2 b/05D-HiveQLReport2
 --Hive QL script: Generates report
 --File name: hiveConsolidated-Year-Month-Report.hql
 ---------------------------------------------------
 
 USE default;
 DROP TABLE consolidated_YM_report;

 CREATE TABLE IF NOT EXISTS consolidated_YM_report(
     process string,
     node string,
     year int,
     month int,
     occurrence int)
     ROW FORMAT DELIMITED 
     FIELDS TERMINATED BY ','
     LINES TERMINATED BY '\n';

 --Process name = event text before the first '[' if present, else before the first ':', else the whole event
 --Counts are grouped by node, year, month and process name
 INSERT OVERWRITE TABLE consolidated_YM_report
 SELECT
     CASE locate('[',event,1)
         WHEN 0 THEN
             CASE locate(':',event,1)
                 WHEN 0 THEN event
                 ELSE substr(event,1,(locate(':',event,1))-1)
             END
         ELSE substr(event,1,(locate('[',event,1))-1)
     END process,
     Node,
     Year,
     Month,
     Count(*) Occurrence
 FROM SysLogEvents
 GROUP BY
     node,
     year,
     month,
     CASE locate('[',event,1)
         WHEN 0 THEN
             CASE locate(':',event,1)
                 WHEN 0 THEN event
                 ELSE substr(event,1,(locate(':',event,1))-1)
             END
         ELSE substr(event,1,(locate('[',event,1))-1)
     END
 ORDER BY process ASC, node ASC, year, month;
diff --git a/05E-HiveQLReport3 b/05E-HiveQLReport3
 --Hive QL script: Generates report
 --File name: hiveTop3Processes-Year-Report.hql
 ---------------------------------------------------
 
 USE default;
 DROP TABLE top3_process_by_year_report;

 CREATE TABLE IF NOT EXISTS top3_process_by_year_report(
     process string,
     year int,
     occurrence int)
     ROW FORMAT DELIMITED
     FIELDS TERMINATED BY ','
     LINES TERMINATED BY '\n';

 --Inner query X counts events per year and process name; the outer query keeps the 3 largest counts
 INSERT OVERWRITE TABLE top3_process_by_year_report
 SELECT process, year, occurrence
 FROM (
     SELECT
         CASE locate('[',event,1)
             WHEN 0 THEN
                 CASE locate(':',event,1)
                     WHEN 0 THEN event
                     ELSE substr(event,1,(locate(':',event,1))-1)
                 END
             ELSE substr(event,1,(locate('[',event,1))-1)
         END process,
         Year,
         Count(*) Occurrence
     FROM SysLogEvents
     GROUP BY
         year,
         CASE locate('[',event,1)
             WHEN 0 THEN
                 CASE locate(':',event,1)
                     WHEN 0 THEN event
                     ELSE substr(event,1,(locate(':',event,1))-1)
                 END
             ELSE substr(event,1,(locate('[',event,1))-1)
         END
     ORDER BY process ASC, year, Occurrence DESC
 ) X
 WHERE process IS NOT NULL
 ORDER BY occurrence DESC
 LIMIT 3;
diff --git a/06-OozieSMTPconfiguration b/06-OozieSMTPconfiguration
 Oozie SMTP configuration
 ------------------------
 The following needs to be added to oozie-site.xml - after updating per your environment and configuration;
 
 
 <!-- SMTP params-->
 <property>
  <name>oozie.email.smtp.host</name>
  <value>cdh-dev01</value>
 </property>
 <property>
  <name>oozie.email.smtp.port</name>
  <value>25</value>
 </property>
 <property>
  <name>oozie.email.from.address</name>
  <value>oozie@cdh-dev01</value>
 </property>
 <property>
  <name>oozie.email.smtp.auth</name>
  <value>false</value>
 </property>
 <property>
  <name>oozie.email.smtp.username</name>
  <value></value>
 </property>
 <property>
  <name>oozie.email.smtp.password</name>
  <value></value>
 </property>
diff --git a/07-OozieCoordinatorProperties b/07-OozieCoordinatorProperties
 #------------------------------------------------------------
 # Oozie coordinator properties file
 # Filename: coordinator.properties
 #------------------------------------------------------------

 #Coordinator job properties file

 # Cluster endpoints and queue
 nameNode=hdfs://cdh-nn01.chuntikhadoop.com:8020
 jobTracker=cdh-jt01:8021
 queueName=default

 # HDFS locations of the coordinator, workflow and sub-workflow applications
 oozieProjectRoot=${nameNode}/user/${user.name}/oozieProject
 appRoot=${oozieProjectRoot}/sampleCoordinatorJobTrigFileDep
 oozie.coord.application.path=${appRoot}/coordinatorConf
 workflowAppPath=${appRoot}/workflowApp
 subWorkflowAppPath=${workflowAppPath}/hiveSubWorkflowApp

 oozie.libpath=${nameNode}/user/oozie/share/lib
 oozie.use.system.libpath=true
 oozie.wf.rerun.failnodes=true

 # Job input, java-main output and the directory watched for trigger.dat
 inputDir=${oozieProjectRoot}/data/*/*/*/*/*
 outputDirJavaMain=${appRoot}/output-JavaMain
 triggerFileDir=${appRoot}/triggerDir

 # Timestamps use the yyyy-MM-ddTHH:mmZ form (the hour is two digits)
 toEmailAddress=akhanolk@cdh-dev01
 startTime=2013-07-09T15:55Z
 endTime=2013-07-09T15:57Z
 timeZoneDef=UTC

 # NOTE(review): a properties file does not perform shell command substitution, so the
 # value below is the literal backtick string, not a record count - confirm how the
 # workflow's decision node is expected to evaluate it.
 inputDirRecordCount=`cat ${inputDir} | wc -l`
 minRequiredRecordCount=1


diff --git a/08-OozieCoordinatorXML b/08-OozieCoordinatorXML
 <!-- ====================================== -->
 <!-- Coordinator xml file: coordinator.xml  -->
 <!-- ====================================== -->
 
 <coordinator-app name="AirawatCoordJobTrigDep"
                  frequency="${coord:days(1)}"
                  start="${startTime}"
                  end="${endTime}"
                  timezone="${timeZoneDef}"
                  xmlns="uri:oozie:coordinator:0.1"
                  xmlns:sla="uri:oozie:sla:0.1">
     <!-- Materialization controls -->
     <controls>
         <timeout>20</timeout>
         <concurrency>6</concurrency>
         <execution>FIFO</execution>
     </controls>
     <!-- The trigger directory is the dataset; trigger.dat is its done-flag -->
     <datasets>
         <dataset name="inputDS" frequency="${coord:days(1)}" initial-instance="${startTime}" timezone="${timeZoneDef}">
             <uri-template>${triggerFileDir}</uri-template>
             <done-flag>trigger.dat</done-flag>
         </dataset>
     </datasets>
     <!-- The action depends on the dataset instance at startTime -->
     <input-events>
         <data-in name="AirawatCoordTrigDepInput" dataset="inputDS">
             <instance>${startTime}</instance>
         </data-in>
     </input-events>
     <!-- Workflow application launched by this coordinator -->
     <action>
         <workflow>
             <app-path>${workflowAppPath}</app-path>
         </workflow>
     </action>
 </coordinator-app>
diff --git a/09-OozieWorkflowXML b/09-OozieWorkflowXML
 <!-- ================================ -->
 <!-- Workflow xml file: workflow.xml  -->
 <!-- ================================ -->

 <workflow-app name="AirawatSampleCoordJob-Parent" xmlns="uri:oozie:workflow:0.1">
     <start to="inputAvailableCheckDecision"/>

     <!-- Continue to the fork only when the EL predicate holds; otherwise end the workflow -->
     <decision name="inputAvailableCheckDecision">
         <switch>
             <case to="startTaskFork">
                 ${inputDirRecordCount gt minRequiredRecordCount}
             </case>
             <default to="end"/>
         </switch>
     </decision>

     <!-- Run the java main action and the hive sub-workflow concurrently -->
     <fork name="startTaskFork">
         <path start="javaMainAction"/>
         <path start="hiveSubWorkflow"/>
     </fork>

     <!-- Java main action: deletes the previous output directory, then runs LogEventCount -->
     <action name="javaMainAction">
         <java>
             <job-tracker>${jobTracker}</job-tracker>
             <name-node>${nameNode}</name-node>
             <prepare>
                 <delete path="${outputDirJavaMain}"/>
             </prepare>
             <configuration>
                 <property>
                     <name>mapred.job.queue.name</name>
                     <value>${queueName}</value>
                 </property>
             </configuration>
             <main-class>Airawat.Oozie.Samples.LogEventCount</main-class>
             <arg>${inputDir}</arg>
             <arg>${outputDirJavaMain}</arg>
         </java>
         <ok to="joiningControl-P"/>
         <error to="sendErrorEmail"/>
     </action>

     <!-- Sub-workflow holding the two hive report actions -->
     <action name='hiveSubWorkflow'>
         <sub-workflow>
             <app-path>${subWorkflowAppPath}</app-path>
             <propagate-configuration/>
         </sub-workflow>
         <ok to="joiningControl-P" />
         <error to="sendErrorEmail" />
     </action>

     <join name="joiningControl-P" to="cleanUp"/>

     <!-- Remove the trigger directory once both branches have finished -->
     <action name='cleanUp'>
         <fs>
             <delete path="${triggerFileDir}"/>
         </fs>
         <ok to="end" />
         <error to="sendErrorEmail" />
     </action>

     <!-- Error path: send the email, then kill the job whether or not the email succeeded -->
     <action name="sendErrorEmail">
         <email xmlns="uri:oozie:email-action:0.1">
             <to>${toEmailAddress}</to>
             <subject>Status of workflow ${wf:id()}</subject>
             <body>The workflow ${wf:name()} with id -${wf:id()}, had issues and will be killed;  The error logged is: ${wf:errorMessage(wf:lastErrorNode())}</body>
         </email>
         <ok to="killJobAction"/>
         <error to="killJobAction"/>
     </action>

     <kill name="killJobAction">
         <message>"Killed job due to error: ${wf:errorMessage(wf:lastErrorNode())}"</message>
     </kill>

     <end name="end" />
 </workflow-app>

diff --git a/10-OozieSubWorkflowXML b/10-OozieSubWorkflowXML
 <!-- ==================================== -->
 <!-- Sub-workflow xml file: workflow.xml  -->
 <!-- ==================================== -->
 
 <workflow-app name="AirawatSampleCoordJob-Child" xmlns="uri:oozie:workflow:0.1">
     <start to="startConcurrentHiveTasksFork"/>

     <!-- Run both hive report scripts concurrently -->
     <fork name="startConcurrentHiveTasksFork">
         <path start="hiveActionIssuesByYM"/>
         <path start="hiveActionTop3Issues"/>
     </fork>

     <!-- Report 2: hiveConsolidated-Year-Month-Report.hql -->
     <action name="hiveActionIssuesByYM">
         <hive xmlns="uri:oozie:hive-action:0.2">
             <job-tracker>${jobTracker}</job-tracker>
             <name-node>${nameNode}</name-node>
             <job-xml>${subWorkflowAppPath}/hive-site.xml</job-xml>
             <script>${subWorkflowAppPath}/hiveConsolidated-Year-Month-Report.hql</script>
         </hive>
         <ok to="joiningControl-C"/>
         <error to="sendErrorEmail"/>
     </action>

     <!-- Report 3: hiveTop3Processes-Year-Report.hql -->
     <action name="hiveActionTop3Issues">
         <hive xmlns="uri:oozie:hive-action:0.2">
             <job-tracker>${jobTracker}</job-tracker>
             <name-node>${nameNode}</name-node>
             <job-xml>${subWorkflowAppPath}/hive-site.xml</job-xml>
             <script>${subWorkflowAppPath}/hiveTop3Processes-Year-Report.hql</script>
         </hive>
         <ok to="joiningControl-C"/>
         <error to="sendErrorEmail"/>
     </action>

     <kill name="killJobAction">
         <message>"Killed job due to error: ${wf:errorMessage(wf:lastErrorNode())}"</message>
     </kill>

     <!-- Error path: send the email, then kill the job whether or not the email succeeded -->
     <action name="sendErrorEmail">
         <email xmlns="uri:oozie:email-action:0.1">
             <to>${toEmailAddress}</to>
             <subject>Status of child workflow ${wf:id()}</subject>
             <body>The workflow ${wf:id()} had issues and will be killed;  The error logged is: ${wf:errorMessage(wf:lastErrorNode())}</body>
         </email>
         <ok to="killJobAction"/>
         <error to="killJobAction"/>
     </action>

     <join name="joiningControl-C" to="end"/>
     <end name="end" />
 </workflow-app>
diff --git a/11-OozieJobExecutionCommands b/11-OozieJobExecutionCommands
 Executing the oozie coordinator job
 ------------------------------------
 
 Step 1) Modify coordinator.properties file
 
 Set the start and end time to be in the future, UTC, so you can see how the job is in waiting state prior to start time condition being met; The following are the entries that need to be changed.
 
 startTime=2013-07-09T03:45Z
 endTime=2013-07-09T03:47Z
 
 Step 2) Submit the coordinator job
 
 $ oozie job -oozie http://cdh-dev01:11000/oozie -config oozieProject/sampleCoordinatorJobTrigFileDep/coordinatorConf/coordinator.properties -run

 Step 3) Publish trigger file to run job

 $ hadoop fs -put oozieProject/sampleCoordinatorJobTrigFileDep/triggerDir oozieProject/sampleCoordinatorJobTrigFileDep


 Replace cdh-dev01 with your oozie server, and 11000 with the associated port number;
diff --git a/12A-Rpt1-JavaMainProgramOutput b/12A-Rpt1-JavaMainProgramOutput
 Output of java program:
 ------------------------
 
 $ hadoop fs -ls -R oozieProject/sampleCoordinatorJobTrigFileDep/out*/part* | awk '{print $8}' | xargs hadoop fs -cat
 2013-NetworkManager  7
 2013-console-kit-daemon  7
 2013-gnome-session	11
 2013-init	166
 2013-kernel	810
 2013-login	2
 2013-nm-dispatcher.action	4
 2013-ntpd_initres	4133
 2013-polkit-agent-helper-1	8
 2013-pulseaudio	18
 2013-spice-vdagent	15
 2013-sshd	6
 2013-sudo	8
 2013-udevd	6
diff --git a/12B-Rpt2-HiveProgramOutputIssuesByMonth b/12B-Rpt2-HiveProgramOutputIssuesByMonth
 Results of report 2, from execution of hiveConsolidated-Year-Month-Report.hql
 ------------------------------------------------------------------------------
 
 hive>
  set hive.cli.print.header=true;
 
 hive> select * from consolidated_YM_report;
 OK
 process  node  year	month	occurrence
 NULL	cdh-dev01	2013	5	19
 NULL	cdh-vms	2013	5	6
 NetworkManager	cdh-dev01	2013	5	7
 console-kit-daemon	cdh-dev01	2013	5	7
 gnome-session	cdh-dev01	2013	5	11
 init	cdh-dev01	2013	5	38
 init	cdh-dn01	2013	5	17
 init	cdh-dn02	2013	5	17
 init	cdh-dn03	2013	5	23
 init	cdh-jt01	2013	5	17
 init	cdh-nn01	2013	5	29
 init	cdh-vms	2013	5	25
 kernel	cdh-dev01	2013	5	203
 kernel	cdh-dn01	2013	5	67
 kernel	cdh-dn02	2013	5	58
 kernel	cdh-dn03	2013	5	58
 kernel	cdh-jt01	2013	5	76
 kernel	cdh-nn01	2013	5	172
 kernel	cdh-vms	2013	5	176
 login	cdh-vms	2013	5	2
 nm-dispatcher.action	cdh-dev01	2013	5	4
 ntpd_initres	cdh-dev01	2013	5	57
 ntpd_initres	cdh-dn01	2013	5	803
 ntpd_initres	cdh-dn02	2013	5	804
 ntpd_initres	cdh-dn03	2013	5	792
 ntpd_initres	cdh-jt01	2013	5	804
 ntpd_initres	cdh-nn01	2013	5	834
 ntpd_initres	cdh-vms	2013	5	39
 polkit-agent-helper-1	cdh-dev01	2013	5	8
 pulseaudio	cdh-dev01	2013	4	1
 pulseaudio	cdh-dev01	2013	5	17
 spice-vdagent	cdh-dev01	2013	4	1
 spice-vdagent	cdh-dev01	2013	5	14
 sshd	cdh-dev01	2013	5	6
 sudo	cdh-dn02	2013	4	1
 sudo	cdh-dn02	2013	5	1
 sudo	cdh-dn03	2013	4	1
 sudo	cdh-dn03	2013	5	1
 sudo	cdh-jt01	2013	4	3
 sudo	cdh-jt01	2013	5	1
 udevd	cdh-dn01	2013	5	1
 udevd	cdh-dn02	2013	5	1
 udevd	cdh-dn03	2013	5	1
 udevd	cdh-jt01	2013	5	1
 udevd	cdh-vms	2013	5	2
 Time taken: 5.841 seconds
diff --git a/12C-Rpt3-HiveProgramOutputTop3Issues b/12C-Rpt3-HiveProgramOutputTop3Issues
 Results of report 3, from execution of hiveTop3Processes-Year-Report.hql
 ------------------------------------------------------------------------
 --Get top3 issues logged by year
 
 hive>
  set hive.cli.print.header=true;
 
 hive>
  select * from top3_process_by_year_report;
 
 process  year  occurrence
 ntpd_initres	2013	4133
 kernel	2013	810
 init	2013	166
 Time taken: 0.385 seconds
diff --git a/13-OozieWebConsoleScreenshots b/13-OozieWebConsoleScreenshots
 http://hadooped.blogspot.com/p/ooziecooridnatorjobtrigfiledep-pix-oozie.html
	This gist includes components of a oozie (trigger file initiated) coordinator job -
	scripts/code, sample data and commands; Oozie actions covered: hdfs action, email action,
	java main action, hive action; Oozie controls covered: decision, fork-join; The workflow
	includes a sub-workflow that runs two hive actions concurrently. The hive table is
	partitioned; Parsing uses hive-regex serde, and Java-regex. Also, the java mapper, gets
	the input directory path and includes part of it in the key.

	Usecase
	-------
	Parse Syslog generated log files to generate reports;

	Pictorial overview of job:
	--------------------------
	http://hadooped.blogspot.com/p/ooziecooridnatorjobtrigfiledep-pix.html

	Includes:
	---------
	Sample data and structure: 01-SampleDataAndStructure
	Data and script download: 02-DataAndScriptDownload
	Data load commands: 03-HdfsLoadCommands
	Java MR - Mapper code: 04A-MapperJavaCode
	Java MR - Reducer code: 04B-ReducerJavaCode
	Java MR - Driver code: 04C-DriverJavaCode
	Command to test Java MR program: 04D-CommandTestJavaMRProg
	Hive -create log table command 05A-HiveCreateTable
	Hive -load partitions 05B-HiveLoadPartitions
	Hive commands to test data loaded 05C-HiveDataLoadTestCommands
	Hive QL script for report 2 05D-HiveQLReport2
	Hive QL script for report 3 05E-HiveQLReport3
	Oozie configuration for email 06-OozieSMTPconfiguration
	Oozie coorindator properties file 07-OozieCoordinatorProperties
	Oozie cooridinator conf file 08-OozieCoordinatorXML
	Oozie workflow conf file 09-OozieWorkflowXML
	Oozie sub-workflow conf file 10-OozieSubWorkflowXML
	Oozie commands 11-OozieJobExecutionCommands
	Output -Report1 12A-Rpt1-JavaMainProgramOutput
	Output -Report2 12B-Rpt2-HiveProgramOutputIssuesByMonth
	Output -Report3 12C-Rpt3-HiveProgramOutputTop3Issues
	Oozie web console - screenshots 13-OozieWebConsoleScreenshots
	Sample data
	------------
	May 3 11:52:54 cdh-dn03 init: tty (/dev/tty6) main process (1208) killed by TERM signal
	May 3 11:53:31 cdh-dn03 kernel: registered taskstats version 1
	May 3 11:53:31 cdh-dn03 kernel: sr0: scsi3-mmc drive: 32x/32x xa/form2 tray
	May 3 11:53:31 cdh-dn03 kernel: piix4_smbus 0000:00:07.0: SMBus base address uninitialized - upgrade BIOS or use force_addr=0xaddr
	May 3 11:53:31 cdh-dn03 kernel: nf_conntrack version 0.5.0 (7972 buckets, 31888 max)
	May 3 11:53:57 cdh-dn03 kernel: hrtimer: interrupt took 11250457 ns
	May 3 11:53:59 cdh-dn03 ntpd_initres[1705]: host name not found: 0.rhel.pool.ntp.org

	Structure
	----------
	Month = May
	Day = 3
	Time = 11:52:54
	Node = cdh-dn03
	Process = init:
	Log msg = tty (/dev/tty6) main process (1208) killed by TERM signal
	Data download
	-------------
	Github:
	https://github.com/airawat/OozieSamples

	Email me at [email protected] if you encounter any issues


	Directory structure
	-------------------
	oozieProject
	data
	airawat-syslog
	<<Node-Name>>
	<<Year>>
	<<Month>>
	messages

	sampleCoordinatorJobTrigFileDep
	triggerDir
	trigger.dat

	coordinatorConf/
	coordinator.properties
	coordinator.xml

	workflowApp
	workflow.xml

	hiveSubWorkflowApp
	hive-site.xml
	hiveConsolidated-Year-Month-Report.hql
	hiveTop3Processes-Year-Report.hql
	workflow.xml

	lib
	LogEventCount.jar
	Hdfs load commands
	------------------

	$ hadoop fs -mkdir oozieProject
	$ hadoop fs -put oozieProject/data oozieProject/
	$ hadoop fs -put oozieProject/sampleCoordinatorJobTrigFileDep oozieProject/

	Run command below to validate load against expected directory structure in section 02-DataAndScriptDownload
	$ hadoop fs -ls -R oozieProject/sampleCoordinatorJobTrigFileDep | awk '{print $8}'

	Remove the trigger file directory - we will load it when we want to execute the job
	$ hadoop fs -rm -R oozieProject/sampleCoordinatorJobTrigFileDep/triggerDir/
	// Source code for Mapper
	//-----------------------------------------------------------
	// LogEventCountMapper.java
	//-----------------------------------------------------------
	// Java program that parses logs using regex
	// The program counts the number of processes logged by year.
	// E.g. Key=2013-ntpd; Value=1;

	package Airawat.Oozie.Samples;

	import java.io.IOException;

	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import org.apache.hadoop.io.IntWritable;
	import org.apache.hadoop.io.LongWritable;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.mapreduce.Mapper;
	import org.apache.hadoop.mapreduce.lib.input.FileSplit;


	public class LogEventCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

	String strLogEntryPattern = "(\\w+)\\s+(\\d+)\\s+(\\d+:\\d+:\\d+)\\s+(\\w+\\W\\w)\\s+(.?\\:)\\s+(.$)";
	public static final int NUM_FIELDS = 6;
	Text strEvent = new Text("");

	@Override
	public void map(LongWritable key, Text value, Context context)
	throws IOException, InterruptedException {

	String strLogEntryLine = value.toString();
	Pattern objPtrn = Pattern.compile(strLogEntryPattern);

	Matcher objPatternMatcher = objPtrn.matcher(strLogEntryLine);
	if (!objPatternMatcher.matches() \|\| NUM_FIELDS != objPatternMatcher.groupCount()) {
	System.err.println("Bad log entry (or problem with RE?):");
	System.err.println(strLogEntryLine);
	return;
	}
	/*
	System.out.println("Month_Name: " + objPatternMatcher.group(1));
	System.out.println("Day: " + objPatternMatcher.group(2));
	System.out.println("Time: " + objPatternMatcher.group(3));
	System.out.println("Node: " + objPatternMatcher.group(4));
	System.out.println("Process: " + objPatternMatcher.group(5));
	System.out.println("LogMessage: " + objPatternMatcher.group(6));
	*/
	//Oh what a pretty chunk of code ;)
	strEvent.set(((FileSplit)context.getInputSplit()).getPath().toString().substring((((FileSplit)context.getInputSplit()).getPath().toString().length()-16), (((FileSplit)context.getInputSplit()).getPath().toString().length()-12)) + "-" + ((objPatternMatcher.group(5).toString().indexOf("[")) == -1 ? (objPatternMatcher.group(5).toString().substring(0,(objPatternMatcher.group(5).length()-1))) : (objPatternMatcher.group(5).toString().substring(0,(objPatternMatcher.group(5).toString().indexOf("["))))));

	context.write(strEvent, new IntWritable(1));

	}
	}
	// Source code for reducer
	//--------------------------
	// LogEventCountReducer.java
	//--------------------------

	package Airawat.Oozie.Samples;
	import java.io.IOException;

	import org.apache.hadoop.io.IntWritable;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.mapreduce.Reducer;


	/**
	 * Reducer that adds up all the counts received for one event key and
	 * writes a single (key, total) record.
	 */
	public class LogEventCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

	    /**
	     * Sums the values for {@code key} and emits the total.
	     *
	     * @param key     event key shared by all the values
	     * @param values  the individual counts for this key
	     * @param context task context that receives the output record
	     * @throws IOException          if writing the output record fails
	     * @throws InterruptedException if the task is interrupted while writing
	     */
	    @Override
	    public void reduce(Text key, Iterable<IntWritable> values, Context context)
	            throws IOException, InterruptedException {
	        int total = 0;
	        final java.util.Iterator<IntWritable> counts = values.iterator();
	        while (counts.hasNext()) {
	            total = total + counts.next().get();
	        }

	        final IntWritable sum = new IntWritable(total);
	        context.write(key, sum);
	    }
	}
	Commands to test the java program
	---------------------------------

	a) Command to run the program

	$ hadoop jar oozieProject/sampleCoordinatorJobTrigFileDep/workflowApp/lib/LogEventCount.jar Airawat.Oozie.Samples.LogEventCount "oozieProject/sampleCoordinatorJobTrigFileDep/data/*/*/*/*/*" "oozieProject/sampleCoordinatorJobTrigFileDep/myCLIOutput"

	b) Command to view results

	$ hadoop fs -cat oozieProject/sampleCoordinatorJobTrigFileDep/myCLIOutput/part*

	c) Results

	2013-NetworkManager 7
	2013-console-kit-daemon 7
	2013-gnome-session 11
	2013-init 166
	2013-kernel 810
	2013-login 2
	2013-nm-dispatcher.action 4
	2013-ntpd_initres 4133
	2013-polkit-agent-helper-1 8
	2013-pulseaudio 18
	2013-spice-vdagent 15
	2013-sshd 6
	2013-sudo 8
	2013-udevd 6
	Hive script to create table for logs
	-------------------------------------

	hive>
	-- External table over the raw syslog files, partitioned by node, year and month.
	-- Each line is split into the six columns below by the capture groups of
	-- input.regex, in order: month_name, day, time, host, event, log.
	-- The quantifiers on the last three groups are required; without them host
	-- names such as "cdh-dn03" and multi-character events/messages do not match.
	CREATE EXTERNAL TABLE SysLogEvents(
	month_name STRING,
	day STRING,
	time STRING,
	host STRING,
	event STRING,
	log STRING)
	PARTITIONED BY(node string,year int, month int)
	ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
	WITH SERDEPROPERTIES (
	"input.regex" = "(\\w+)\\s+(\\d+)\\s+(\\d+:\\d+:\\d+)\\s+(\\w+\\W*\\w*)\\s+(.*?\\:)\\s+(.*$)"
	)
	stored as textfile;
	Hive scripts to create and load partitions
	-------------------------------------------

	Note: Replace my user ID "akhanolk" with yours

	hive >

	-- Register one partition per (node, year, month) directory of log files.
	-- IF NOT EXISTS makes each statement safe to re-run.

	-- cdh-dev01
	ALTER TABLE SysLogEvents ADD IF NOT EXISTS PARTITION (node="cdh-dev01", year=2013, month=04)
	LOCATION '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dev01/2013/04/';
	ALTER TABLE SysLogEvents ADD IF NOT EXISTS PARTITION (node="cdh-dev01", year=2013, month=05)
	LOCATION '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dev01/2013/05/';

	-- cdh-dn01
	ALTER TABLE SysLogEvents ADD IF NOT EXISTS PARTITION (node="cdh-dn01", year=2013, month=05)
	LOCATION '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dn01/2013/05/';

	-- cdh-dn02
	ALTER TABLE SysLogEvents ADD IF NOT EXISTS PARTITION (node="cdh-dn02", year=2013, month=04)
	LOCATION '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dn02/2013/04/';
	ALTER TABLE SysLogEvents ADD IF NOT EXISTS PARTITION (node="cdh-dn02", year=2013, month=05)
	LOCATION '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dn02/2013/05/';

	-- cdh-dn03
	ALTER TABLE SysLogEvents ADD IF NOT EXISTS PARTITION (node="cdh-dn03", year=2013, month=04)
	LOCATION '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dn03/2013/04/';
	ALTER TABLE SysLogEvents ADD IF NOT EXISTS PARTITION (node="cdh-dn03", year=2013, month=05)
	LOCATION '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-dn03/2013/05/';

	-- cdh-jt01
	ALTER TABLE SysLogEvents ADD IF NOT EXISTS PARTITION (node="cdh-jt01", year=2013, month=04)
	LOCATION '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-jt01/2013/04/';
	ALTER TABLE SysLogEvents ADD IF NOT EXISTS PARTITION (node="cdh-jt01", year=2013, month=05)
	LOCATION '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-jt01/2013/05/';

	-- cdh-nn01
	ALTER TABLE SysLogEvents ADD IF NOT EXISTS PARTITION (node="cdh-nn01", year=2013, month=05)
	LOCATION '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-nn01/2013/05/';

	-- cdh-vms
	ALTER TABLE SysLogEvents ADD IF NOT EXISTS PARTITION (node="cdh-vms", year=2013, month=05)
	LOCATION '/user/akhanolk/oozieProject/data/airawat-syslog/cdh-vms/2013/05/';
	Hive ql to test data loaded
	----------------------------

	hive>
	-- Print column headers in query results
	set hive.cli.print.header=true;

	-- Add the hive-contrib jar so that queries which launch MapReduce jobs work
	-- (presumably because the table uses the contrib RegexSerDe - confirm for your
	-- setup); your environment may not need this step
	add jar hadoop-lib/hive-contrib-0.10.0-cdh4.2.0.jar;

	-- Sample query: show two rows to confirm the data is readable
	select * from SysLogEvents limit 2;