#!/usr/bin/env bash
################################################################################################
#
# Use this to establish a baseline for a cluster by running a variety of TeraGen/TeraSort jobs.
#
# From the input parameters, the script makes some basic calculations to determine a load
# profile that will utilize the cluster to its fullest potential. For these tests to perform
# consistently, we assume the cluster is FULLY available for the duration of the tests.
#
# Input parameters: (TODO's)
#   - Size in Mb, Gb or Tb
#   - Mappers
#   - Reducers
#
# Running this script:
#   :>nohup ./<script> --size <..> > results.out &
#   :>tail -f results.out
# This will log the script output to a file for safekeeping. The results should be saved.
################################################################################################
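# Example invocation (values are illustrative; pick sizes that match your cluster):
#   :>nohup ./<script> --size 500Gb --mappers 512 --reducers 2048 --queue benchmark > results.out &
#   :>tail -f results.out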
GEN_ONLY=false
# 1Tb across 1024 mappers will create 1Gb part files.
SIZE=1Tb
# Make adjustments based on your cluster: Containers * Nodes...
# Defaults
#   2048 Mappers will generate 512Mb files
#   1024 Mappers will generate 1Gb files
TERAGEN_MAPPERS=1024
# Estimates (for the default 1Tb size)
#   8192 Reducers will sort 128Mb each
#   4096 Reducers will sort 256Mb each
#   2048 Reducers will sort 512Mb each
TERASORT_REDUCERS=4096
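# Sanity check on the defaults: 1Tb / 4096 reducers = 2^40 / 2^12 bytes = 256Mb per sort
# partition, matching the estimate above.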
ADDITIONAL_DIRECTIVES=
while [ $# -gt 0 ]; do
  case "$1" in
    --size)
      shift
      SIZE=$1
      shift
      ;;
    --mappers)
      shift
      TERAGEN_MAPPERS=$1
      shift
      ;;
    --reducers)
      shift
      TERASORT_REDUCERS=$1
      shift
      ;;
    --queue)
      shift
      ADDITIONAL_DIRECTIVES="${ADDITIONAL_DIRECTIVES} -Dmapreduce.job.queuename=$1"
      shift
      ;;
    # Support coming in Hadoop 2.7.4+
    #--label)
    #  shift
    #  ADDITIONAL_DIRECTIVES="${ADDITIONAL_DIRECTIVES} $1"
    #  shift
    #  ;;
    --gen-only)
      shift
      GEN_ONLY=true
      ;;
    --help)
      echo "Usage: $0 --size <size in Mb, Gb or Tb, e.g. 1Tb, 500Gb or 50Mb> --mappers <number of mappers for teragen> --reducers <number of reducers for terasort>"
      exit 1
      ;;
    *)
      # Pass any unrecognized argument through to the Hadoop jobs (e.g. extra -D directives).
      ADDITIONAL_DIRECTIVES="${ADDITIONAL_DIRECTIVES} $1"
      shift
      ;;
  esac
done
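# At this point ADDITIONAL_DIRECTIVES holds any extra -D settings; e.g. with "--queue benchmark"
# it would be " -Dmapreduce.job.queuename=benchmark".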
echo "Running with... Size=${SIZE} , Mappers=${TERAGEN_MAPPERS} , Reducers=${TERASORT_REDUCERS}, Additional Directives=${ADDITIONAL_DIRECTIVES}" | |
VALUE="" | |
MULTIPLIER="" | |
if [[ "$SIZE" =~ "Mb" ]]; then | |
# echo "In Megabytes!" | |
MULTIPLIER=2 | |
VALUE=`echo $SIZE | awk -F'Mb' '{print $1}'` | |
fi | |
if [[ "$SIZE" =~ "Gb" ]]; then | |
# echo "In Gigabytes!" | |
MULTIPLIER=3 | |
VALUE=`echo $SIZE | awk -F'Gb' '{print $1}'` | |
fi | |
if [[ "$SIZE" =~ "Tb" ]]; then | |
# echo "In Terabytes!" | |
MULTIPLIER=4 | |
VALUE=`echo $SIZE | awk -F'Tb' '{print $1}'` | |
fi | |
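# e.g. SIZE=500Gb yields VALUE=500 and MULTIPLIER=3, i.e. bytes = 500 * 1024^3.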
if [ "${VALUE}" == "" ]; then | |
echo "Unable to determine size. Use Mb, Gb or Tb. IE: 500Mb or 36Gb or 1Tb" | |
echo "Usage: $0 --size <size in GB or TB 1Tb or 500Gb or 50Mb> --mappers <number of mappers for teragen> --reducers <number of reducers for sort>" | |
exit -1 | |
fi | |
BASE_DIR=./perf
# Convert size to TeraGen rows: each TeraGen row is 100 bytes, so rows = bytes / 100.
GEN_ROWS=`echo "($VALUE*1024^$MULTIPLIER)/100" | bc`
# Use a locale with thousands grouping so printf "%'.f" renders a readable row count.
LC_NUMERIC=en_US
GEN_ROWS_COUNT=`printf "%'.f\n" ${GEN_ROWS}`
echo "TeraGen/Sort Rows: ${GEN_ROWS_COUNT}"
MR_EXAMPLES_JAR=/usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-examples.jar
if [ ! -f $MR_EXAMPLES_JAR ]; then
  echo "Couldn't find jar file with teragen/sort: $MR_EXAMPLES_JAR"
  exit 1
fi
hdfs dfs -mkdir -p $BASE_DIR
# MR2 property keys
MAPPER_COUNT_KEY=mapreduce.job.maps
REDUCER_COUNT_KEY=mapreduce.job.reduces
DFS_BLOCK_SIZE_KEY=dfs.blocksize
# Using powers of 2 to establish 64Mb - 1Gb blocksize attempts.
#   2^26 = 64Mb
#   2^27 = 128Mb
#   2^28 = 256Mb
#   2^29 = 512Mb
#   2^30 = 1Gb
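# e.g. 2^27 = 134217728 bytes = 128Mb, the single block size exercised by the default loop below.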
for bsp in 27; do
# for bsp in 26 27 28 29 30; do
  DFS_BLOCK_SIZE=`echo "2^$bsp" | bc`
  echo "BLOCK SIZE: $DFS_BLOCK_SIZE"
  hdfs dfs -rm -r -skipTrash $BASE_DIR/teragen_$bsp
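  # Each of the TERAGEN_MAPPERS map tasks writes GEN_ROWS/TERAGEN_MAPPERS rows, i.e. roughly
  # SIZE/TERAGEN_MAPPERS per part file (~1Gb each with the defaults).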
  hadoop jar $MR_EXAMPLES_JAR teragen $ADDITIONAL_DIRECTIVES -D$DFS_BLOCK_SIZE_KEY=$DFS_BLOCK_SIZE -D$MAPPER_COUNT_KEY=$TERAGEN_MAPPERS $GEN_ROWS $BASE_DIR/teragen_$bsp 2>&1
  if [ "${GEN_ONLY}" == "false" ]; then
    hdfs dfs -rm -r -skipTrash $BASE_DIR/terasort_$bsp
    hadoop jar $MR_EXAMPLES_JAR terasort $ADDITIONAL_DIRECTIVES -D$REDUCER_COUNT_KEY=$TERASORT_REDUCERS $BASE_DIR/teragen_$bsp $BASE_DIR/terasort_$bsp 2>&1
  fi
done
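# Optional follow-up (not part of the baseline run above): TeraValidate, shipped in the same
# examples jar, checks that the TeraSort output is globally sorted and writes a small report.
# Uncomment to validate each run; the report path below is just a suggested convention.
# for bsp in 27; do
#   hdfs dfs -rm -r -skipTrash $BASE_DIR/teravalidate_$bsp
#   hadoop jar $MR_EXAMPLES_JAR teravalidate $BASE_DIR/terasort_$bsp $BASE_DIR/teravalidate_$bsp 2>&1
# done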