v5tech · May 4, 2014 07:19
diff --git a/00-LogParser-PythonMR-UsingRegex b/00-LogParser-PythonMR-UsingRegex
 This gist includes a mapper and a reducer in Python that parse log files using
 regex; use case: count the number of occurrences of each process logged, by month.

 Includes:
 ---------
 Sample data
 Review of log data structure
 Sample data and scripts for download
 Mapper
 Reducer
 Commands
 Sample output

diff --git a/01-SampleData b/01-SampleData
 Sample data
 ------------
 May  3 11:52:54 cdh-dn03 init: tty (/dev/tty6) main process (1208) killed by TERM signal
 May  3 11:53:31 cdh-dn03 kernel: registered taskstats version 1
 May  3 11:53:31 cdh-dn03 kernel: sr0: scsi3-mmc drive: 32x/32x xa/form2 tray
 May  3 11:53:31 cdh-dn03 kernel: piix4_smbus 0000:00:07.0: SMBus base address uninitialized - upgrade BIOS or use force_addr=0xaddr
 May  3 11:53:31 cdh-dn03 kernel: nf_conntrack version 0.5.0 (7972 buckets, 31888 max)
 May  3 11:53:57 cdh-dn03 kernel: hrtimer: interrupt took 11250457 ns
 May  3 11:53:59 cdh-dn03 ntpd_initres[1705]: host name not found: 0.rhel.pool.ntp.org
diff --git a/02-LogStructure b/02-LogStructure
 Structure of data:
 ------------------
 Sample-
 May  3 11:52:54 cdh-dn03 init: tty (/dev/tty6) main process (1208) killed by TERM signal

 Structure-
 Month   = May  
 Day     = 3 
 Time    = 11:52:54 
 Node    = cdh-dn03 
 Process = init: 
 Log msg = tty (/dev/tty6) main process (1208) killed by TERM signal
diff --git a/03-LinkToScriptsAndSampleData b/03-LinkToScriptsAndSampleData
 Link to sample data and scripts: 
 --------------------------------
 This is the link to the zip file containing the data and the scripts.
 https://groups.google.com/forum/?hl=en#!topic/hadooped/MlTLyqw_DUg

 Directory Structure
 -------------------
 LogParserSamplePython	
    Data
        airawat-syslog
            2013
                04
                    messages
            2013
                05
                    messages
    LogParserReducer.py  
    LogParserMapper.py

diff --git a/04-LogParserMapper.py b/04-LogParserMapper.py
 #!/usr/bin/env python
 # This is the mapper file [LogParserMapper.py]
 #
 # Streaming mapper: reads syslog lines from stdin and, for every line that
 # matches the expected layout, writes "month-process<TAB>1" to stdout.
 # The shebang has to be the very first line of the file, otherwise it has no
 # effect when the script is executed directly.

 import sys
 import re
 import os

 # Capture groups of the pattern below:
 #   1 = month name
 #   2 = day
 #   3 = time (hh:mm:ss)
 #   4 = node
 #   5 = process, including its trailing ':'
 #   6 = log message
 # No re.VERBOSE flag: the pattern contains no layout whitespace or comments.
 data_pattern = r"(\w+)\s+(\d+)\s+(\d+:\d+:\d+)\s+(\w+\W*\w*)\s+(.*?\:)\s+(.*$)"
 regex_obj = re.compile(data_pattern)

 #--- get all lines from stdin ---
 for strLineRead in sys.stdin:

     #--- remove leading and trailing whitespace, then split the line into fields ---
     parsed_log = regex_obj.search(strLineRead.strip())

     #--- lines that do not match the pattern are silently skipped ---
     if parsed_log:
         #--- output key-value pair ---
         # A single parenthesised argument keeps print working on Python 2 and 3.
         print('%s\t%s' % (parsed_log.group(1) + "-" + parsed_log.group(5), "1"))
diff --git a/05-LogParserReducer.py b/05-LogParserReducer.py
 #!/usr/bin/env python
 # This is the reducer file [LogParserReducer.py]
 #
 # Streaming reducer: reads "event<TAB>count" lines from stdin, sums the counts
 # per event and writes one "event<TAB>total" line per event to stdout.
 # The shebang has to be the very first line of the file, otherwise it has no
 # effect when the script is executed directly.
 import sys

 eventCountArray = {}

 # Input is from STDIN
 for line in sys.stdin:
     # Remove leading and trailing whitespace
     line = line.strip()

     # Parse the input from the mapper. Blank or tab-less lines cannot be
     # unpacked into (event, count); skip them instead of crashing.
     fields = line.split('\t', 1)
     if len(fields) != 2:
         continue
     event, count = fields

     # Cast count to int; records with a non-numeric count are ignored
     try:
         count = int(count)
     except ValueError:
         continue

     # Compute event count (a missing key starts from 0)
     eventCountArray[event] = eventCountArray.get(event, 0) + count

 # Write the results (unsorted) to stdout
 # A single parenthesised argument keeps print working on Python 2 and 3.
 for event in eventCountArray:
     print('%s\t%s' % (event, eventCountArray[event]))
diff --git a/06-JobCommand b/06-JobCommand
 The following is a listing of commands:
 ---------------------------------------


 #Command to test the mapper
 cat Data/*/*/*/* | python LogParserMapper.py

 #Command to test the mapper and reducer
 cat Data/*/*/*/* | python LogParserMapper.py | sort | python LogParserReducer.py | sort

 #Command to load application to hdfs
 hadoop fs -put oozieProject/pythonApplication/ oozieProject/

 #Command to run on cluster
 hadoop jar <<path to hadoop streaming jar>> -D mapred.reduce.tasks=<<num tasks>>  -file <<path to mapper script>>  -mapper <<mapper file>>  -file <<path to reducer script>>  -reducer <<reducer script name>>    -input <<input directory>> -output <<output directory>>

 #Sample command
 hadoop jar /opt/cloudera/parcels/CDH-4.2.0-1.cdh4.2.0.p0.10/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.2.0.jar  -D mapred.reduce.tasks=1  -file oozieProject/pythonApplication/LogParserMapper.py  -mapper oozieProject/pythonApplication/LogParserMapper.py  -file oozieProject/pythonApplication/LogParserReducer.py  -reducer oozieProject/pythonApplication/LogParserReducer.py    -input oozieProject/pythonApplication/Data/*/*/*/* -output oozieProject/pythonApplication/output-streaming

 #View output
 $ hadoop fs -cat oozieProject/pythonApplication/output-streaming/part-00000
diff --git a/07-SampleOutput b/07-SampleOutput
 Sample output
 -------------
 May-kernel:	58
 Apr-sudo:	1
 May-udevd[361]:	1
 May-init:	23
 May-ntpd_initres[1705]:	792
 May-sudo:	1
	This gist includes a mapper and a reducer in Python that parse log files using
	regex; use case: count the number of occurrences of each process logged, by month.

	Includes:
	---------
	Sample data
	Review of log data structure
	Sample data and scripts for download
	Mapper
	Reducer
	Commands
	Sample output
	Sample data
	------------
	May 3 11:52:54 cdh-dn03 init: tty (/dev/tty6) main process (1208) killed by TERM signal
	May 3 11:53:31 cdh-dn03 kernel: registered taskstats version 1
	May 3 11:53:31 cdh-dn03 kernel: sr0: scsi3-mmc drive: 32x/32x xa/form2 tray
	May 3 11:53:31 cdh-dn03 kernel: piix4_smbus 0000:00:07.0: SMBus base address uninitialized - upgrade BIOS or use force_addr=0xaddr
	May 3 11:53:31 cdh-dn03 kernel: nf_conntrack version 0.5.0 (7972 buckets, 31888 max)
	May 3 11:53:57 cdh-dn03 kernel: hrtimer: interrupt took 11250457 ns
	May 3 11:53:59 cdh-dn03 ntpd_initres[1705]: host name not found: 0.rhel.pool.ntp.org
	Structure of data:
	------------------
	Sample-
	May 3 11:52:54 cdh-dn03 init: tty (/dev/tty6) main process (1208) killed by TERM signal

	Structure-
	Month = May
	Day = 3
	Time = 11:52:54
	Node = cdh-dn03
	Process = init:
	Log msg = tty (/dev/tty6) main process (1208) killed by TERM signal
	Link to sample data and scripts:
	--------------------------------
	This is the link to the zip file containing the data and the scripts.
	https://groups.google.com/forum/?hl=en#!topic/hadooped/MlTLyqw_DUg

	Directory Structure
	-------------------
	LogParserSamplePython
	    Data
	        airawat-syslog
	            2013
	                04
	                    messages
	            2013
	                05
	                    messages
	    LogParserReducer.py
	    LogParserMapper.py
	#!/usr/bin/env python
	# This is the mapper file [LogParserMapper.py]
	#
	# Streaming mapper: reads syslog lines from stdin and, for every line that
	# matches the expected layout, writes "month-process<TAB>1" to stdout.
	# The shebang has to be the very first line of the file, otherwise it has no
	# effect when the script is executed directly.

	import sys
	import re
	import os

	# Capture groups of the pattern below:
	#   1 = month name
	#   2 = day
	#   3 = time (hh:mm:ss)
	#   4 = node
	#   5 = process, including its trailing ':'
	#   6 = log message
	# The '*' quantifiers are required: without them the node and process
	# groups cannot match a line such as
	#   May 3 11:52:54 cdh-dn03 init: tty (/dev/tty6) main process (1208) killed by TERM signal
	# No re.VERBOSE flag: the pattern contains no layout whitespace or comments.
	data_pattern = r"(\w+)\s+(\d+)\s+(\d+:\d+:\d+)\s+(\w+\W*\w*)\s+(.*?\:)\s+(.*$)"
	regex_obj = re.compile(data_pattern)

	#--- get all lines from stdin ---
	for strLineRead in sys.stdin:

	    #--- remove leading and trailing whitespace, then split the line into fields ---
	    parsed_log = regex_obj.search(strLineRead.strip())

	    #--- lines that do not match the pattern are silently skipped ---
	    if parsed_log:
	        #--- output key-value pair ---
	        # A single parenthesised argument keeps print working on Python 2 and 3.
	        print('%s\t%s' % (parsed_log.group(1) + "-" + parsed_log.group(5), "1"))
	#!/usr/bin/env python
	# This is the reducer file [LogParserReducer.py]
	#
	# Streaming reducer: reads "event<TAB>count" lines from stdin, sums the counts
	# per event and writes one "event<TAB>total" line per event to stdout.
	# The shebang has to be the very first line of the file, otherwise it has no
	# effect when the script is executed directly.
	import sys

	eventCountArray = {}

	# Input is from STDIN
	for line in sys.stdin:
	    # Remove leading and trailing whitespace
	    line = line.strip()

	    # Parse the input from the mapper. Blank or tab-less lines cannot be
	    # unpacked into (event, count); skip them instead of crashing.
	    fields = line.split('\t', 1)
	    if len(fields) != 2:
	        continue
	    event, count = fields

	    # Cast count to int; records with a non-numeric count are ignored
	    try:
	        count = int(count)
	    except ValueError:
	        continue

	    # Compute event count (a missing key starts from 0)
	    eventCountArray[event] = eventCountArray.get(event, 0) + count

	# Write the results (unsorted) to stdout
	# A single parenthesised argument keeps print working on Python 2 and 3.
	for event in eventCountArray:
	    print('%s\t%s' % (event, eventCountArray[event]))
	The following is a listing of commands:
	---------------------------------------


	#Command to test the mapper
	cat Data/*/*/*/* | python LogParserMapper.py

	#Command to test the mapper and reducer
	cat Data/*/*/*/* | python LogParserMapper.py | sort | python LogParserReducer.py | sort

	#Command to load application to hdfs
	hadoop fs -put oozieProject/pythonApplication/ oozieProject/

	#Command to run on cluster
	hadoop jar <<path to hadoop streaming jar>> -D mapred.reduce.tasks=<<num tasks>> -file <<path to mapper script>> -mapper <<mapper file>> -file <<path to reducer script>> -reducer <<reducer script name>> -input <<input directory>> -output <<output directory>>

	#Sample command
	hadoop jar /opt/cloudera/parcels/CDH-4.2.0-1.cdh4.2.0.p0.10/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.2.0.jar -D mapred.reduce.tasks=1 -file oozieProject/pythonApplication/LogParserMapper.py -mapper oozieProject/pythonApplication/LogParserMapper.py -file oozieProject/pythonApplication/LogParserReducer.py -reducer oozieProject/pythonApplication/LogParserReducer.py -input oozieProject/pythonApplication/Data/*/*/*/* -output oozieProject/pythonApplication/output-streaming

	#View output
	$ hadoop fs -cat oozieProject/pythonApplication/output-streaming/part-00000
	Sample output
	-------------
	May-kernel: 58
	Apr-sudo: 1
	May-udevd[361]: 1
	May-init: 23
	May-ntpd_initres[1705]: 792
	May-sudo: 1