Last active
April 12, 2023 03:30
-
-
Save jbenninghoff/c595c304ae8677a1b55ddfc4fe74960b to your computer and use it in GitHub Desktop.
Launch XML extract job in S3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Launch the qari.XmlExtraction MapReduce job on S3 input, then archive the
# job's stdout and its job-history report to S3.
#
# Requires on PATH: aws, hadoop, mapred, jq.
# Reads the job flow id from /mnt/var/lib/info/job-flow.json; it is used as
# the prefix of the uploaded log object names.
#
# Exit status: the hadoop job's status if it failed; otherwise non-zero if
# a prerequisite or a log upload failed; 0 when everything succeeded.

# Use this to capture output into var using read pipeline.
# lastpipe only takes effect while job control is off, hence `set +m`.
set +m; shopt -s lastpipe
# Treat unset variables as errors and let a pipeline fail if any stage fails.
set -u -o pipefail

readonly data_url='s3://jobennin-emr-data/hp-mapr'
readonly log_url='s3://jb-workday-logs/hp-mapr'
readonly stdout_log='/tmp/qari.XmlExtraction.stdout.log'

# Print a message to stderr and abort.
die() { printf '%s\n' "$*" >&2; exit 1; }

# Resolve the job flow id before the expensive job runs, so a missing or
# malformed job-flow.json is reported immediately. jq -r prints "null" when
# the key is absent.
cid=$(jq -r .jobFlowId /mnt/var/lib/info/job-flow.json) \
  || die "cannot read jobFlowId from /mnt/var/lib/info/job-flow.json"
[[ -n "$cid" && "$cid" != "null" ]] \
  || die "jobFlowId missing in /mnt/var/lib/info/job-flow.json"

# Copy JARs and XML locally, files needed as args to job launch
for artifact in java_extraction_byteswritable.jar configint.xml commons-lang-2.6.jar; do
  aws s3 cp "${data_url}/${artifact}" . || die "failed to download ${artifact}"
done

# Generic options passed to the job; kept in an array so each word reaches
# hadoop exactly as written, regardless of spaces in $PWD.
job_opts=(
  -libjars "${PWD}/commons-lang-2.6.jar"
  -D mapreduce.job.reduces=1
  -D mapreduce.map.memory.mb=4000 -D mapreduce.map.java.opts="-Xmx3600m"
  -D mapreduce.reduce.memory.mb=36864 -D mapreduce.reduce.java.opts="-Xmx33648m"
  -D mapreduce.reduce.cpu.vcores=8
  -D mapreduce.reduce.shuffle.parallelcopies=8
  -D mapreduce.task.io.sort.mb=800
  -D mapreduce.task.io.sort.factor=80
  -D mapreduce.map.sort.spill.percent=1.0
  -D mapreduce.map.speculative=false
  -D mapreduce.map.output.compress=true
  -D mapreduce.input.fileinputformat.split.minsize=41943040
)

# Run the job. Remember its status instead of aborting so the logs are
# still uploaded when the job fails.
job_status=0
hadoop jar java_extraction_byteswritable.jar qari.XmlExtraction \
  "${job_opts[@]}" \
  "${data_url}/input/" "${data_url}/output/" configint.xml \
  > "$stdout_log" || job_status=$?

# Log the job history stats to S3
#jobnum=$(egrep -o 'job_[0-9]+_[0-9]+' ~/qari.XmlExtraction.stdout.log |sort -u)
# Pick the highest job id. Sort numerically on the two numeric fields of
# job_<n>_<n>; a plain `sort -n` sees the non-numeric "job_" prefix and
# compares nothing numerically.
jobnum=''
mapred job -list all |& grep -E -o 'job_[0-9]+_[0-9]+' \
  | sort -t_ -k2,2n -k3,3n | tail -1 | read -r jobnum

upload_status=0
if [[ -n "$jobnum" ]]; then
  mapred job -history "$jobnum" \
    | aws s3 cp - "${log_url}/${cid}-XmlExtract.hist" \
    || { printf 'warning: failed to upload history for %s\n' "$jobnum" >&2; upload_status=1; }
else
  printf 'warning: no job id found in "mapred job -list all"; history not uploaded\n' >&2
  upload_status=1
fi

aws s3 cp "$stdout_log" "${log_url}/${cid}-XmlExtract.stdout.log" \
  || { printf 'warning: failed to upload %s\n' "$stdout_log" >&2; upload_status=1; }

# The job's own failure takes precedence over upload problems.
if (( job_status != 0 )); then
  exit "$job_status"
fi
exit "$upload_status"
# MapReduce configuration reference:
#https://hadoop.apache.org/docs/r2.10.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
# Alternative settings (disabled):
# -D mapreduce.fileoutputcommitter.algorithm.version=2 \
# -D mapreduce.map.memory.mb=5280 -D mapreduce.map.java.opts="-Xmx4800m" \
# -D mapreduce.job.reduce.slowstart.completedmaps=0.01 \
# -D mapreduce.input.fileinputformat.split.minsize=20971520
# -D mapreduce.shuffle.connection-keep-alive.enable=true
# -D mapreduce.task.io.sort.factor=48
# -D mapreduce.reduce.memory.mb=20480 -D mapreduce.reduce.java.opts="-Xmx18432m" \
#time hadoop jar java_extraction_byteswritable.jar qari.XmlExtraction -D mapreduce.job.reduces=1 /user/hadoop/input/ /user/hadoop/output/ configint.xml
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment