Skip to content

Instantly share code, notes, and snippets.

@kokosing
Last active July 5, 2018 11:16
Show Gist options
  • Save kokosing/14575ee82b2eb682182d46525327311f to your computer and use it in GitHub Desktop.
Save kokosing/14575ee82b2eb682182d46525327311f to your computer and use it in GitHub Desktop.
Statistics JSON file generator from Presto Hive stats
#!/bin/bash
# Turns every `*stats` file in the current directory into a statistics JSON
# file by passing each one to generate().
#
# Strict mode: abort on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (-o pipefail).
set -euo pipefail
#######################################
# Normalize a low/high value from the stats dump for use in the JSON output.
# Drops the first "." and then any leading zeros, so "0.50" becomes "50" and
# "12.25" becomes "1225"; a value with neither (for example "1992-01-01")
# passes through unchanged. A value consisting only of zeros yields "0"
# instead of an empty string, and a leading "-" is kept in front of the
# stripped digits so negative values never come out as "-05".
# Arguments: $1 - raw value; may be empty
# Outputs:   normalized value on stdout; nothing when $1 is empty
#######################################
function _normalize_stat_bound() {
  local raw=$1 sign='' digits zeros
  if [[ -z "$raw" ]]; then
    return 0
  fi
  if [[ "$raw" == -* ]]; then
    sign='-'
    raw=${raw#-}
  fi
  digits=${raw/./}
  # ${digits%%[!0]*} is the run of leading zeros (the whole string if it is
  # made of zeros only); removing that prefix strips them.
  zeros=${digits%%[!0]*}
  digits=${digits#"$zeros"}
  printf '%s%s\n' "$sign" "${digits:-0}"
}

#######################################
# Convert one CSV stats dump into a statistics JSON file.
#
# Every input line is split on "," after all double quotes are removed. The
# fields are read in this order: column name, data size, distinct values
# count, nulls fraction, row count, low value, high value. The row count is
# taken from the first line whose 5th field is non-empty. A line with an
# empty column name is treated as the table summary row: it emits a
# "dummyColumn" entry and closes the JSON document, so it is expected to be
# the last line of the file.
#
# Globals:   PYTHON (read, optional) - interpreter used to multiply the
#            nulls fraction by the row count; defaults to "python"
# Arguments: $1 - path of the stats file; the output path is the same with a
#            trailing ".stats" removed and ".json" appended
# Outputs:   writes the JSON file; diagnostics go to stderr
# Returns:   non-zero when no row count can be found in the stats file
#######################################
function generate() {
  local stats_file=$1
  # Remove only a literal trailing ".stats", so names that merely contain
  # "stats" somewhere else (e.g. "my_stats_tbl.stats") are handled correctly.
  local json_file="${stats_file%.stats}.json"
  local -r date_like='.-..-'
  local line unused rows=''
  local column_name data_size ndv nulls low high

  # Pass 1: find the table row count. "|| [[ -n ... ]]" keeps a final line
  # that has no trailing newline.
  while read -r line || [[ -n "$line" ]]; do
    line=${line//\"/}
    IFS=, read -r unused unused unused unused rows unused <<< "$line"
    [[ -z "$rows" ]] || break
  done < "$stats_file"

  if [[ -z "$rows" ]]; then
    printf 'generate: no row count (5th CSV field) found in %s\n' \
      "$stats_file" >&2
    return 1
  fi

  # Pass 2: emit the header and one JSON object per column.
  {
    printf '{\n"rowCount" : %s,\n"columns" : {\n' "$rows"

    while read -r line || [[ -n "$line" ]]; do
      line=${line//\"/}
      # A blank line carries no data and must not be mistaken for the
      # summary row.
      [[ -n "$line" ]] || continue
      IFS=, read -r column_name data_size ndv nulls unused low high unused \
        <<< "$line"

      if [[ -n "$nulls" ]]; then
        # The dump holds a fraction; the JSON wants an absolute count.
        nulls=$("${PYTHON:-python}" -c "print($nulls*$rows)")
      fi
      low=$(_normalize_stat_bound "$low")
      high=$(_normalize_stat_bound "$high")

      if [[ -z "$column_name" ]]; then
        # Summary row: the placeholder entry absorbs the trailing comma left
        # by the previous column, then the document is closed.
        printf '"dummyColumn" : {}\n}\n}\n'
        continue
      fi

      printf '"%s" : {\n' "$column_name"
      if [[ -n "$data_size" ]]; then
        printf '"dataSize" : %s,\n' "$data_size"
      fi

      if [[ -z "$low" ]]; then
        printf '"nullsCount" : %s,\n' "$nulls"
        printf '"distinctValuesCount" : %s\n' "$ndv"
      elif [[ "$low" =~ $date_like ]]; then
        # Values shaped like "x-xx-" (dates) are emitted as JSON strings.
        printf '"distinctValuesCount" : %s,\n' "$ndv"
        printf '"nullsCount" : %s,\n' "$nulls"
        printf '"min" : "%s",\n' "$low"
        printf '"max" : "%s"\n' "$high"
      else
        printf '"distinctValuesCount" : %s,\n' "$ndv"
        printf '"nullsCount" : %s,\n' "$nulls"
        printf '"min" : %s,\n' "$low"
        printf '"max" : %s\n' "$high"
      fi
      printf '},\n'
    done < "$stats_file"
  } > "$json_file"
}
#######################################
# Run the Presto CLI executable jar located under
# ~/presto/presto-cli/target, forwarding every argument unchanged.
# Arguments: "$@" - passed straight through to the CLI
#######################################
presto_cli() {
  local -a launcher
  # The version part of the jar name is matched by the glob.
  launcher=(~/presto/presto-cli/target/presto-cli-*-executable.jar)
  "${launcher[@]}" "$@"
}
#server="${1}"
#schema="${2}"
#
#set -x
#presto_cli --server "${server}" --execute "show tables from hive.${schema}" --output-format TSV > tables.txt
#
#while read table; do
# presto_cli --server "${server}" --execute "show stats for hive.${schema}.${table}" > "${table}".stats
#done < tables.txt
#set +x
# Convert every stats dump in the current directory, printing each file name
# before it is processed.
for f in *stats; do
  # When nothing matches, the glob stays as the literal "*stats"; skip it
  # instead of handing a non-existent file to generate.
  [[ -e "$f" ]] || continue
  printf '%s\n' "$f"
  generate "$f"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment