@cozek
Created January 19, 2019 04:02
Python 3 script for easily running Hadoop MapReduce programs
#!/usr/bin/env python3
# DISCLAIMER: Provided with no warranty; you are responsible for whatever may befall you
# or your property as a result of using this script.
# You implicitly agree to this by running this script.
# Feel free to improve, modify, and distribute.
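#
# Subcommands (pass exactly one as the first argument):
#   make       compile the .java file and package the classes into a jar
#   run        submit the jar to Hadoop with the input/output paths configured below
#   cleanDFS   delete the HDFS output directory so the job can be rerun
#   cleanJar   delete local .class and .jar build artifacts
#   getoutput  print the job's part-* output files from HDFS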
import os
import sys
import subprocess

input_directory = "/user/cosec/words.txt"  # input file; must already exist in HDFS
output_directory = "/user/cosec/op3"  # HDFS location where you want the job's output
program_name = "WordCount"  # name of the main class inside your .java file
jar_file = "cw.jar"  # the jar package you want to create
java_program = "WordCount.java"  # the source file to compile; must be compatible with your Hadoop version

clean_jar = "rm -f *.class *.jar"  # removes previously compiled classes and jars (including cw.jar)
compile_java = "hadoop com.sun.tools.javac.Main " + java_program
create_jar = "jar cf " + jar_file + " *.class"
hadoop = " ".join(["hadoop jar", jar_file, program_name, input_directory, output_directory])
# what the final command looks like:
# hadoop jar cw.jar WordCount /user/cosec/words.txt /user/cosec/op3

# JAVA_HOME differs between ready-made Hadoop distributions; set it accordingly.
# An easy way to find it: type /usr/java/ and press TAB to let the shell complete the path.
# The default for Cloudera CDH 5.3 is set below.
# Exporting in a child shell (e.g. os.system("export ...")) never reaches this process,
# so the variables are set through os.environ and inherited by every subprocess below.
os.environ["JAVA_HOME"] = "/usr/java/jdk1.7.0_67-cloudera/"
os.environ["HADOOP_CLASSPATH"] = os.path.join(os.environ["JAVA_HOME"], "lib/tools.jar")
clean_hdfs = "hdfs dfs -rm -r " + output_directory

if len(sys.argv) < 2:
    sys.exit("usage: {} make|run|cleanDFS|cleanJar|getoutput".format(sys.argv[0]))
if sys.argv[1] == 'make':
    os.system(compile_java)
    os.system(create_jar)
if sys.argv[1] == 'cleanDFS':
    subprocess.call(clean_hdfs, shell=True)
if sys.argv[1] == 'cleanJar':
    subprocess.call(clean_jar, shell=True)
if sys.argv[1] == 'run':
    subprocess.call(hadoop, shell=True)
if sys.argv[1] == 'getoutput':
    subprocess.call("hdfs dfs -cat " + output_directory + "/part*", shell=True)