arbakker · June 28, 2022 09:52
diff --git a/README.md b/README.md
diff --git a/benchmark.sh b/benchmark.sh
 #!/usr/bin/env bash
 # Benchmarks WCS GetCoverage response times for random bounding boxes.
 #
 # Usage: benchmark.sh RASTER_SIZES OUTPUT_DIR COVERAGES BASE_URL \
 #            [NR_OF_REQUEST] [NR_OF_PARALLEL_REQUEST]
 set -euo pipefail

 # With `set -u` a missing positional argument would only report
 # "unbound variable"; `:?` aborts with the usage line instead.
 USAGE="usage: ${0##*/} RASTER_SIZES OUTPUT_DIR COVERAGES BASE_URL [NR_OF_REQUEST] [NR_OF_PARALLEL_REQUEST]"
 RASTER_SIZES="${1:?$USAGE}"       # whitespace-separated list of raster sizes
 OUTPUT_DIR="${2:?$USAGE}"         # directory that receives the result CSV files
 COVERAGES="${3:?$USAGE}"          # whitespace-separated list of coverage names
 BASE_URL="${4:?$USAGE}"           # WCS endpoint, without query string
 NR_OF_REQUEST="${5:-50}"          # requests per coverage and raster size
 NR_OF_PARALLEL_REQUEST="${6:-2}"  # passed to `parallel -j`

 # Range from which gen_random_bbox draws the lower-left corner of each bbox.
 readonly BBOXBOUND_MINX=140100
 readonly BBOXBOUND_MINY=450100
 readonly BBOXBOUND_MAXX=217900
 readonly BBOXBOUND_MAXY=497900


 mkdir -p -- "$OUTPUT_DIR"
 # Prints the first path derived from $1 that is not an existing regular file.
 # Candidates are tried in order: $1 itself, then <stem>_1<ext>, <stem>_2<ext>, ...
 # Arguments: $1 - desired output path
 # Outputs:   the chosen path on stdout
 function get_output_filename() {
     local requested="$1"
     local dir="" base stem ext="" candidate
     local n=0

     # Split on the basename only, so a dot in a directory name (or a name
     # without any extension) cannot corrupt the stem/extension split.
     base="${requested##*/}"
     if [[ "$requested" == */* ]]; then
         dir="${requested%/*}/"
     fi
     if [[ "$base" == ?*.* ]]; then
         stem="${base%.*}"
         ext=".${base##*.}"
     else
         stem="$base"
     fi

     candidate="$requested"
     while [[ -f "$candidate" ]]; do
         n=$((n + 1))
         candidate="${dir}${stem}_${n}${ext}"
     done
     printf '%s\n' "$candidate"
 }


 # Prints a random square bounding box as "minx,miny,maxx,maxy".
 # The lower-left corner is drawn uniformly from the BBOXBOUND_* range and the
 # box is MAX_SIZE wide and high, so the upper-right corner can lie up to
 # MAX_SIZE beyond BBOXBOUND_MAXX / BBOXBOUND_MAXY.
 # Globals read: BBOXBOUND_MINX, BBOXBOUND_MAXX, BBOXBOUND_MINY, BBOXBOUND_MAXY, MAX_SIZE
 function gen_random_bbox() {
     local minx miny maxx maxy
     minx=$(shuf -i "${BBOXBOUND_MINX}-${BBOXBOUND_MAXX}" -n1)
     miny=$(shuf -i "${BBOXBOUND_MINY}-${BBOXBOUND_MAXY}" -n1)
     maxx=$((minx + MAX_SIZE))
     maxy=$((miny + MAX_SIZE))
     printf '%s,%s,%s,%s\n' "$minx" "$miny" "$maxx" "$maxy"
 }

 # Requests one coverage for one bounding box and records the response time.
 # Arguments: $1 - coverage name
 #            $2 - bounding box as "minx,miny,maxx,maxy"
 # Globals read: BASE_URL, OUTPUT_FILE
 # Outputs: the request URL on stdout; appends "<coverage>,<seconds>" to
 #          OUTPUT_FILE only when the response Content-Type is image/tiff.
 # Requires a `date` that supports %N (as the original did).
 function get_cov_request() {
     local cov=$1
     local bbox=$2
     local min_x min_y max_x max_y width height url
     local start_ms end_ms elapsed_ms elapsed content_type

     IFS=, read -r min_x min_y max_x max_y <<< "$bbox"
     # Two pixels are requested per coordinate unit.
     width=$(((max_x - min_x) * 2))
     height=$(((max_y - min_y) * 2))

     url="${BASE_URL}?service=WCS&version=1.0.0&request=GetCoverage&coverage=${cov}&crs=EPSG:28992&response_crs=EPSG:28992&bbox=${min_x},${min_y},${max_x},${max_y}&width=${width}&height=${height}&format=image/tiff"
     echo "$url"

     # Millisecond timestamps keep the arithmetic in the shell (no bc needed)
     # and give a leading zero for sub-second durations.
     start_ms=$(date +%s%3N)
     # A failed or timed-out request is simply not recorded.
     content_type=$(curl -w '%{content_type}' --max-time 60 -s "$url" -o /dev/null) || content_type=""
     end_ms=$(date +%s%3N)
     elapsed_ms=$((end_ms - start_ms))
     printf -v elapsed '%d.%03d' "$((elapsed_ms / 1000))" "$((elapsed_ms % 1000))"

     if [[ $content_type == "image/tiff" ]]; then
         echo "${cov},${elapsed}" >> "$OUTPUT_FILE"
     fi
 }

 # The worker function and the variables it reads are exported so that the
 # shells started by `parallel` can see them.
 export -f get_cov_request
 export BASE_URL

 for RASTER_SIZE in $RASTER_SIZES; do
     # gen_random_bbox makes boxes MAX_SIZE wide; get_cov_request asks for two
     # pixels per unit, so the requested raster is RASTER_SIZE pixels wide.
     MAX_SIZE=$((RASTER_SIZE / 2))
     OUTPUT_SUFFIX="${NR_OF_PARALLEL_REQUEST}_${RASTER_SIZE}"
     OUTPUT_FILE_TEMPLATE="${OUTPUT_DIR}/wcs-performance-par-${OUTPUT_SUFFIX}.csv"
     OUTPUT_FILE="$(get_output_filename "$OUTPUT_FILE_TEMPLATE")"
     export OUTPUT_FILE

     # One newline-separated bbox per request; every coverage is measured
     # against the same set of boxes.
     bboxes=$(for ((run = 0; run < NR_OF_REQUEST; run++)); do gen_random_bbox; done)

     for coverage in $COVERAGES; do
         parallel -j"$NR_OF_PARALLEL_REQUEST" "get_cov_request ${coverage} {}" <<< "$bboxes"
     done
 done
diff --git a/plot.sh b/plot.sh
 #!/usr/bin/env bash
 # Aggregates the benchmark CSV files in OUTPUT_DIR and plots them with gnuplot.
 # Usage: plot.sh OUTPUT_DIR COVERAGES
 set -euo pipefail
 # `:?` gives a usage message instead of the bare "unbound variable" error.
 OUTPUT_DIR="${1:?usage: ${0##*/} OUTPUT_DIR COVERAGES}"
 COVS="${2:?usage: ${0##*/} OUTPUT_DIR COVERAGES}"  # whitespace-separated coverage names
 CSVFILE="${OUTPUT_DIR}/stats.csv"

 # generate csv for gnuplot to use
 # csv looks like:
 # size,cov1-avg,cov1-stddev,cov2-avg,cov2-stddev,etc...
 #
 # One row is written per "*_<size>.csv" file found under OUTPUT_DIR, in
 # version-sort order of the paths. avg is the mean and stddev the sample
 # standard deviation of column 2 over the rows whose column 1 equals the
 # coverage name exactly. A coverage without samples gets two empty fields;
 # a single sample gets stddev 0.000.
 # Globals read: OUTPUT_DIR, COVS, CSVFILE
 function process_stats(){
     local header="size"
     local size_re='_([0-9]+)\.csv$'
     local c file size line coverage stats

     for c in $COVS; do
         c="${c//_/-}"
         header+=",${c}-avg,${c}-stddev"
     done

     {
         printf '%s\n' "$header"
         # NUL-delimited so paths containing whitespace stay intact.
         while IFS= read -r -d '' file; do
             [[ "$file" =~ $size_re ]] || continue
             size="${BASH_REMATCH[1]}"
             line="$size"
             for coverage in $COVS; do
                 stats=$(LC_ALL=C awk -F, -v cov="$coverage" '
                     $1 == cov { values[++n] = $2; sum += $2 }
                     END {
                         if (n == 0) { print ","; exit }
                         mean = sum / n
                         for (i = 1; i <= n; i++) ss += (values[i] - mean) * (values[i] - mean)
                         printf "%.3f,%.3f\n", mean, (n > 1 ? sqrt(ss / (n - 1)) : 0)
                     }' "$file")
                 line+=",${stats}"
             done
             printf '%s\n' "$line"
         done < <(find "$OUTPUT_DIR" -regex '.*/.*_[0-9]+\.csv$' -print0 | sort -zV)
     } > "$CSVFILE"
 }

 # Renders the statistics CSV as a PNG chart: per coverage one line through
 # the averages plus error bars taken from the stddev column.
 # Arguments: $1 - input CSV (column 1 = size, then avg/stddev column pairs)
 #            $2 - output PNG path (default: stats.png)
 #            $3 - gnuplot xrange (default: [0:4100])
 # shellcheck disable=SC2120
 function plot(){
     local input_csv="${1}"
     local output="${2:-stats.png}"
     local range="${3:-[0:4100]}"
     local nr_columns col
     local plot_command="plot " separator=""
     local -a script

     nr_columns=$(head -n1 -- "$input_csv" | tr ',' '\n' | wc -l)
     # Start at 2 to skip the size column; step 2 since avg and stddev come in
     # pairs. Series are joined with ", " so no dangling comma is left behind.
     for ((col = 2; col < nr_columns; col += 2)); do
         plot_command+="${separator}'${input_csv}' using 1:${col} with lines, '${input_csv}' using 1:${col}:$((col + 1)) with yerrorbars"
         separator=", "
     done

     rm -f -- "$output"
     # The gnuplot script is fed through a pipe rather than a here-document so
     # it does not depend on the terminator line starting in column 0.
     script=(
         "set terminal png size 1000,800"
         "set key left top"
         "set xrange ${range}"
         'set ylabel "Response time (in seconds)"'
         'set xlabel "Raster size (nr of pixels)"'
         "set output '${output}'"
         "set grid ytics mytics"   # draw lines for each ytics and mytics
         "set mytics 2"            # set the spacing for the mytics
         "set grid"                # enable the grid
         "set key autotitle columnhead"
         'set datafile separator ","'
         "${plot_command}"
     )
     printf '%s\n' "${script[@]}" | gnuplot
 }

 # Entry point: aggregate the raw timings into $CSVFILE, then render the
 # full-range chart next to it.
 process_stats
 plot "$CSVFILE" "${OUTPUT_DIR}/stats.png"
 # Disabled: additional charts restricted to sub-ranges of the x axis
 # (third argument is the gnuplot xrange).
 # plot "$CSVFILE" "${OUTPUT_DIR}/0-500-stats.png" "[0:600]"
 # plot "$CSVFILE" "${OUTPUT_DIR}/0-1000-stats.png" "[0:1100]"
 # plot "$CSVFILE" "${OUTPUT_DIR}/1000-2000-stats.png" "[900:2100]"
 # plot "$CSVFILE" "${OUTPUT_DIR}/2000-4000-stats.png" "[1900:4100]"
diff --git a/process.sh b/process.sh
 #!/usr/bin/env bash
 # Combines the per-compression, per-size statistics files under output/ into
 # one CSV with a row per raster size:
 #   size,none-avg,none-stddev,deflate-avg,deflate-stddev,zstd-avg,zstd-stddev
 # Usage: process.sh [OUTPUT_DIR]   (directory for plot_stats.csv, default "output")
 set -euo pipefail

 OUTPUT_DIR="${1:-output}"
 output_file="${OUTPUT_DIR}/plot_stats.csv"

 sizes=(10 20 50 100 200 500 1000 1500 2000 3000 4000)

 # stats_by["<comp>,<size>"] holds "<avg>,<stddev>" for one input file.
 declare -A stats_by=()
 for c in none zstd deflate; do
     for s in "${sizes[@]}"; do
         file="output/ahn3_05m_dsm_${c}_stats_2_${s}.txt"
         if [[ ! -r "$file" ]]; then
             # Keep going as before, but say why the columns will be empty.
             printf 'warning: cannot read %s; leaving its columns empty\n' "$file" >&2
             continue
         fi
         # Fields 6 and 7 of the first line containing "x  50" are avg and
         # stddev. NOTE(review): presumably the summary row for 50 samples -
         # confirm against the tool that writes these files.
         stats_by["${c},${s}"]=$(awk '/x  50/ { print $6 "," $7; exit }' "$file")
     done
 done

 # Transpose: one row per size, compressions in header order. Rows carry no
 # trailing comma, so they have exactly as many fields as the header.
 {
     echo "size,none-avg,none-stddev,deflate-avg,deflate-stddev,zstd-avg,zstd-stddev"
     for size in "${sizes[@]}"; do
         row="$size"
         for comp in none deflate zstd; do
             key="${comp},${size}"
             row+=",${stats_by[$key]:-,}"
         done
         echo "$row"
     done
 } > "$output_file"
diff --git a/run-benchmarks.sh b/run-benchmarks.sh
 #!/usr/bin/env bash
 # script to glue together benchmark.sh and plot.sh
 # Both are invoked as ./benchmark.sh and ./plot.sh, so run this script from
 # the directory that contains them.
 # Abort on command errors, unset variables and failing pipeline stages.
 set -euo pipefail
 # Runs one benchmark series into a fresh output directory and plots it.
 # Arguments:
 #   $1 - output directory; removed and recreated on every run
 #   $2 - coverage names, whitespace-separated, e.g. "ahn3_05m_dsm" or
 #        "ahn3_05m_dsm_none ahn3_05m_dsm_deflate ahn3_05m_dsm_zstd"
 #   $3 - WCS base URL, e.g. "https://geodata.nationaalgeoregister.nl/ahn3/wcs"
 #        or "https://service.pdok.nl/rws/ahn3/wcs/v1_0-preprod"
 #   $4 - raster sizes (default: 10 20 50 100 200 500 1000 1500 2000 3000 4000)
 #   $5 - number of requests (default: 50)
 #   $6 - number of parallel requests (default: 2)
 function run-benchmark(){
     # Locals, so a call does not overwrite same-named variables of the caller.
     local output_dir="${1:?run-benchmark: output directory is required}"
     local coverages="$2"
     local url="$3"
     local raster_sizes="${4:-10 20 50 100 200 500 1000 1500 2000 3000 4000}"
     local nr_of_reqs="${5:-50}"
     local parallel_reqs="${6:-2}"

     # Start from an empty directory so results of earlier runs are not mixed in.
     rm -rf -- "$output_dir"
     ./benchmark.sh "$raster_sizes" "$output_dir" "$coverages" "$url" "$nr_of_reqs" "$parallel_reqs"
     ./plot.sh "$output_dir" "$coverages"
 }

 # Run 1: a single coverage against the geodata.nationaalgeoregister.nl
 # endpoint; results go to output-geotiff-nfs.
 run-benchmark output-geotiff-nfs "ahn3_05m_dsm" https://geodata.nationaalgeoregister.nl/ahn3/wcs
 # Run 2: three coverages (none/deflate/zstd name suffixes) against the
 # service.pdok.nl preprod endpoint; results go to output-cog-s3-150.
 run-benchmark output-cog-s3-150 "ahn3_05m_dsm_none ahn3_05m_dsm_deflate ahn3_05m_dsm_zstd" https://service.pdok.nl/rws/ahn3/wcs/v1_0-preprod
	#!/usr/bin/env bash
	# Benchmarks WCS GetCoverage response times for random bounding boxes.
	#
	# Usage: benchmark.sh RASTER_SIZES OUTPUT_DIR COVERAGES BASE_URL \
	#            [NR_OF_REQUEST] [NR_OF_PARALLEL_REQUEST]
	set -euo pipefail

	# With `set -u` a missing positional argument would only report
	# "unbound variable"; `:?` aborts with the usage line instead.
	USAGE="usage: ${0##*/} RASTER_SIZES OUTPUT_DIR COVERAGES BASE_URL [NR_OF_REQUEST] [NR_OF_PARALLEL_REQUEST]"
	RASTER_SIZES="${1:?$USAGE}"       # whitespace-separated list of raster sizes
	OUTPUT_DIR="${2:?$USAGE}"         # directory that receives the result CSV files
	COVERAGES="${3:?$USAGE}"          # whitespace-separated list of coverage names
	BASE_URL="${4:?$USAGE}"           # WCS endpoint, without query string
	NR_OF_REQUEST="${5:-50}"          # requests per coverage and raster size
	NR_OF_PARALLEL_REQUEST="${6:-2}"  # passed to `parallel -j`

	# Range from which gen_random_bbox draws the lower-left corner of each bbox.
	readonly BBOXBOUND_MINX=140100
	readonly BBOXBOUND_MINY=450100
	readonly BBOXBOUND_MAXX=217900
	readonly BBOXBOUND_MAXY=497900


	mkdir -p -- "$OUTPUT_DIR"
	# Prints the first path derived from $1 that is not an existing regular file.
	# Candidates are tried in order: $1 itself, then <stem>_1<ext>, <stem>_2<ext>, ...
	# Arguments: $1 - desired output path
	# Outputs:   the chosen path on stdout
	function get_output_filename() {
		local requested="$1"
		local dir="" base stem ext="" candidate
		local n=0

		# Split on the basename only, so a dot in a directory name (or a name
		# without any extension) cannot corrupt the stem/extension split.
		base="${requested##*/}"
		if [[ "$requested" == */* ]]; then
			dir="${requested%/*}/"
		fi
		if [[ "$base" == ?*.* ]]; then
			stem="${base%.*}"
			ext=".${base##*.}"
		else
			stem="$base"
		fi

		candidate="$requested"
		while [[ -f "$candidate" ]]; do
			n=$((n + 1))
			candidate="${dir}${stem}_${n}${ext}"
		done
		printf '%s\n' "$candidate"
	}


	# Prints a random square bounding box as "minx,miny,maxx,maxy".
	# The lower-left corner is drawn uniformly from the BBOXBOUND_* range and the
	# box is MAX_SIZE wide and high, so the upper-right corner can lie up to
	# MAX_SIZE beyond BBOXBOUND_MAXX / BBOXBOUND_MAXY.
	# Globals read: BBOXBOUND_MINX, BBOXBOUND_MAXX, BBOXBOUND_MINY, BBOXBOUND_MAXY, MAX_SIZE
	function gen_random_bbox() {
		local minx miny maxx maxy
		minx=$(shuf -i "${BBOXBOUND_MINX}-${BBOXBOUND_MAXX}" -n1)
		miny=$(shuf -i "${BBOXBOUND_MINY}-${BBOXBOUND_MAXY}" -n1)
		maxx=$((minx + MAX_SIZE))
		maxy=$((miny + MAX_SIZE))
		printf '%s,%s,%s,%s\n' "$minx" "$miny" "$maxx" "$maxy"
	}

	# Requests one coverage for one bounding box and records the response time.
	# Arguments: $1 - coverage name
	#            $2 - bounding box as "minx,miny,maxx,maxy"
	# Globals read: BASE_URL, OUTPUT_FILE
	# Outputs: the request URL on stdout; appends "<coverage>,<seconds>" to
	#          OUTPUT_FILE only when the response Content-Type is image/tiff.
	# Requires a `date` that supports %N (as the original did).
	function get_cov_request() {
		local cov=$1
		local bbox=$2
		local min_x min_y max_x max_y width height url
		local start_ms end_ms elapsed_ms elapsed content_type

		# Split in the shell; the escaped "\|" previously here was not a pipe.
		IFS=, read -r min_x min_y max_x max_y <<< "$bbox"
		# Two pixels are requested per coordinate unit.
		width=$(((max_x - min_x) * 2))
		height=$(((max_y - min_y) * 2))

		url="${BASE_URL}?service=WCS&version=1.0.0&request=GetCoverage&coverage=${cov}&crs=EPSG:28992&response_crs=EPSG:28992&bbox=${min_x},${min_y},${max_x},${max_y}&width=${width}&height=${height}&format=image/tiff"
		echo "$url"

		# Millisecond timestamps keep the arithmetic in the shell (no bc needed)
		# and give a leading zero for sub-second durations.
		start_ms=$(date +%s%3N)
		# A failed or timed-out request is simply not recorded.
		content_type=$(curl -w '%{content_type}' --max-time 60 -s "$url" -o /dev/null) || content_type=""
		end_ms=$(date +%s%3N)
		elapsed_ms=$((end_ms - start_ms))
		printf -v elapsed '%d.%03d' "$((elapsed_ms / 1000))" "$((elapsed_ms % 1000))"

		if [[ $content_type == "image/tiff" ]]; then
			echo "${cov},${elapsed}" >> "$OUTPUT_FILE"
		fi
	}

	# The worker function and the variables it reads are exported so that the
	# shells started by `parallel` can see them.
	export -f get_cov_request
	export BASE_URL

	for RASTER_SIZE in $RASTER_SIZES; do
		# gen_random_bbox makes boxes MAX_SIZE wide; get_cov_request asks for two
		# pixels per unit, so the requested raster is RASTER_SIZE pixels wide.
		MAX_SIZE=$((RASTER_SIZE / 2))
		OUTPUT_SUFFIX="${NR_OF_PARALLEL_REQUEST}_${RASTER_SIZE}"
		OUTPUT_FILE_TEMPLATE="${OUTPUT_DIR}/wcs-performance-par-${OUTPUT_SUFFIX}.csv"
		OUTPUT_FILE="$(get_output_filename "$OUTPUT_FILE_TEMPLATE")"
		export OUTPUT_FILE

		# One newline-separated bbox per request; every coverage is measured
		# against the same set of boxes.
		bboxes=$(for ((run = 0; run < NR_OF_REQUEST; run++)); do gen_random_bbox; done)

		for coverage in $COVERAGES; do
			# Fed via a here-string; the escaped "\|" previously here was not a pipe.
			parallel -j"$NR_OF_PARALLEL_REQUEST" "get_cov_request ${coverage} {}" <<< "$bboxes"
		done
	done
	#!/usr/bin/env bash
	# Aggregates the benchmark CSV files in OUTPUT_DIR and plots them with gnuplot.
	# Usage: plot.sh OUTPUT_DIR COVERAGES
	set -euo pipefail
	# `:?` gives a usage message instead of the bare "unbound variable" error.
	OUTPUT_DIR="${1:?usage: ${0##*/} OUTPUT_DIR COVERAGES}"
	COVS="${2:?usage: ${0##*/} OUTPUT_DIR COVERAGES}"  # whitespace-separated coverage names
	CSVFILE="${OUTPUT_DIR}/stats.csv"

	# generate csv for gnuplot to use
	# csv looks like:
	# size,cov1-avg,cov1-stddev,cov2-avg,cov2-stddev,etc...
	#
	# One row is written per "*_<size>.csv" file found under OUTPUT_DIR, in
	# version-sort order of the paths. avg is the mean and stddev the sample
	# standard deviation of column 2 over the rows whose column 1 equals the
	# coverage name exactly. A coverage without samples gets two empty fields;
	# a single sample gets stddev 0.000.
	# Globals read: OUTPUT_DIR, COVS, CSVFILE
	function process_stats(){
		local header="size"
		local size_re='_([0-9]+)\.csv$'
		local c file size line coverage stats

		for c in $COVS; do
			c="${c//_/-}"
			header+=",${c}-avg,${c}-stddev"
		done

		{
			printf '%s\n' "$header"
			# NUL-delimited so paths containing whitespace stay intact.
			while IFS= read -r -d '' file; do
				[[ "$file" =~ $size_re ]] || continue
				size="${BASH_REMATCH[1]}"
				line="$size"
				for coverage in $COVS; do
					stats=$(LC_ALL=C awk -F, -v cov="$coverage" '
						$1 == cov { values[++n] = $2; sum += $2 }
						END {
							if (n == 0) { print ","; exit }
							mean = sum / n
							for (i = 1; i <= n; i++) ss += (values[i] - mean) * (values[i] - mean)
							printf "%.3f,%.3f\n", mean, (n > 1 ? sqrt(ss / (n - 1)) : 0)
						}' "$file")
					line+=",${stats}"
				done
				printf '%s\n' "$line"
			done < <(find "$OUTPUT_DIR" -regex '.*/.*_[0-9]+\.csv$' -print0 | sort -zV)
		} > "$CSVFILE"
	}

	# Renders the statistics CSV as a PNG chart: per coverage one line through
	# the averages plus error bars taken from the stddev column.
	# Arguments: $1 - input CSV (column 1 = size, then avg/stddev column pairs)
	#            $2 - output PNG path (default: stats.png)
	#            $3 - gnuplot xrange (default: [0:4100])
	# shellcheck disable=SC2120
	function plot(){
		local input_csv="${1}"
		local output="${2:-stats.png}"
		local range="${3:-[0:4100]}"
		local nr_columns col
		local plot_command="plot " separator=""
		local -a script

		nr_columns=$(head -n1 -- "$input_csv" | tr ',' '\n' | wc -l)
		# Start at 2 to skip the size column; step 2 since avg and stddev come in
		# pairs. Series are joined with ", " so no dangling comma is left behind.
		for ((col = 2; col < nr_columns; col += 2)); do
			plot_command+="${separator}'${input_csv}' using 1:${col} with lines, '${input_csv}' using 1:${col}:$((col + 1)) with yerrorbars"
			separator=", "
		done

		rm -f -- "$output"
		# The gnuplot script is fed through a pipe rather than a here-document so
		# it does not depend on the terminator line starting in column 0.
		script=(
			"set terminal png size 1000,800"
			"set key left top"
			"set xrange ${range}"
			'set ylabel "Response time (in seconds)"'
			'set xlabel "Raster size (nr of pixels)"'
			"set output '${output}'"
			"set grid ytics mytics"   # draw lines for each ytics and mytics
			"set mytics 2"            # set the spacing for the mytics
			"set grid"                # enable the grid
			"set key autotitle columnhead"
			'set datafile separator ","'
			"${plot_command}"
		)
		printf '%s\n' "${script[@]}" | gnuplot
	}

	# Entry point: aggregate the raw timings into $CSVFILE, then render the
	# full-range chart next to it.
	process_stats
	plot "$CSVFILE" "${OUTPUT_DIR}/stats.png"
	# Disabled: additional charts restricted to sub-ranges of the x axis
	# (third argument is the gnuplot xrange).
	# plot "$CSVFILE" "${OUTPUT_DIR}/0-500-stats.png" "[0:600]"
	# plot "$CSVFILE" "${OUTPUT_DIR}/0-1000-stats.png" "[0:1100]"
	# plot "$CSVFILE" "${OUTPUT_DIR}/1000-2000-stats.png" "[900:2100]"
	# plot "$CSVFILE" "${OUTPUT_DIR}/2000-4000-stats.png" "[1900:4100]"
	#!/usr/bin/env bash
	# Combines the per-compression, per-size statistics files under output/ into
	# one CSV with a row per raster size:
	#   size,none-avg,none-stddev,deflate-avg,deflate-stddev,zstd-avg,zstd-stddev
	# Usage: process.sh [OUTPUT_DIR]   (directory for plot_stats.csv, default "output")
	set -euo pipefail

	OUTPUT_DIR="${1:-output}"
	output_file="${OUTPUT_DIR}/plot_stats.csv"

	sizes=(10 20 50 100 200 500 1000 1500 2000 3000 4000)

	# stats_by["<comp>,<size>"] holds "<avg>,<stddev>" for one input file.
	declare -A stats_by=()
	for c in none zstd deflate; do
		for s in "${sizes[@]}"; do
			file="output/ahn3_05m_dsm_${c}_stats_2_${s}.txt"
			if [[ ! -r "$file" ]]; then
				# Keep going as before, but say why the columns will be empty.
				printf 'warning: cannot read %s; leaving its columns empty\n' "$file" >&2
				continue
			fi
			# Fields 6 and 7 of the first line containing "x", blanks, "50" are
			# avg and stddev. The pattern accepts any number of blanks because
			# the spacing in this copy may have been collapsed.
			# NOTE(review): presumably the summary row for 50 samples - confirm
			# against the tool that writes these files.
			stats_by["${c},${s}"]=$(awk '/x +50/ { print $6 "," $7; exit }' "$file")
		done
	done

	# Transpose: one row per size, compressions in header order. Rows carry no
	# trailing comma, so they have exactly as many fields as the header.
	{
		echo "size,none-avg,none-stddev,deflate-avg,deflate-stddev,zstd-avg,zstd-stddev"
		for size in "${sizes[@]}"; do
			row="$size"
			for comp in none deflate zstd; do
				key="${comp},${size}"
				row+=",${stats_by[$key]:-,}"
			done
			echo "$row"
		done
	} > "$output_file"
	#!/usr/bin/env bash
	# script to glue together benchmark.sh and plot.sh
	# Both are invoked as ./benchmark.sh and ./plot.sh, so run this script from
	# the directory that contains them.
	# Abort on command errors, unset variables and failing pipeline stages.
	set -euo pipefail
	# Runs one benchmark series into a fresh output directory and plots it.
	# Arguments:
	#   $1 - output directory; removed and recreated on every run
	#   $2 - coverage names, whitespace-separated, e.g. "ahn3_05m_dsm" or
	#        "ahn3_05m_dsm_none ahn3_05m_dsm_deflate ahn3_05m_dsm_zstd"
	#   $3 - WCS base URL, e.g. "https://geodata.nationaalgeoregister.nl/ahn3/wcs"
	#        or "https://service.pdok.nl/rws/ahn3/wcs/v1_0-preprod"
	#   $4 - raster sizes (default: 10 20 50 100 200 500 1000 1500 2000 3000 4000)
	#   $5 - number of requests (default: 50)
	#   $6 - number of parallel requests (default: 2)
	function run-benchmark(){
		# Locals, so a call does not overwrite same-named variables of the caller.
		local output_dir="${1:?run-benchmark: output directory is required}"
		local coverages="$2"
		local url="$3"
		local raster_sizes="${4:-10 20 50 100 200 500 1000 1500 2000 3000 4000}"
		local nr_of_reqs="${5:-50}"
		local parallel_reqs="${6:-2}"

		# Start from an empty directory so results of earlier runs are not mixed in.
		rm -rf -- "$output_dir"
		./benchmark.sh "$raster_sizes" "$output_dir" "$coverages" "$url" "$nr_of_reqs" "$parallel_reqs"
		./plot.sh "$output_dir" "$coverages"
	}

	# Run 1: a single coverage against the geodata.nationaalgeoregister.nl
	# endpoint; results go to output-geotiff-nfs.
	run-benchmark output-geotiff-nfs "ahn3_05m_dsm" https://geodata.nationaalgeoregister.nl/ahn3/wcs
	# Run 2: three coverages (none/deflate/zstd name suffixes) against the
	# service.pdok.nl preprod endpoint; results go to output-cog-s3-150.
	run-benchmark output-cog-s3-150 "ahn3_05m_dsm_none ahn3_05m_dsm_deflate ahn3_05m_dsm_zstd" https://service.pdok.nl/rws/ahn3/wcs/v1_0-preprod