Skip to content

Instantly share code, notes, and snippets.

@hartfordfive
Last active January 5, 2018 15:57
Show Gist options
  • Save hartfordfive/67d81b0df19f4df95f4450b86576fcac to your computer and use it in GitHub Desktop.
Save hartfordfive/67d81b0df19f4df95f4450b86576fcac to your computer and use it in GitHub Desktop.
Bash functions to get the encoding and the number of characters of a given text file
#######################################
# Writes one CSV size report per date directory found in the current directory.
#
# Every immediate sub-directory whose name ends in a digit is treated as a
# "date" directory. For each of them, all non-trivial *.bz2 files (excluding
# *pid.bz2, *.py.bz2 and *.hdr.bz2) are passed to `filedetails`, and the
# resulting CSV rows are appended to
#   ~/reports/<basename of cwd>/filesize_report_<date>.txt
#
# Arguments:
#   $1 - "run" to actually write the reports; anything else (or nothing)
#        selects dry-run mode, which only prints what would be done.
# Outputs:
#   Progress messages on stdout; report files in run mode.
# Returns:
#   1 if the report directory cannot be created, otherwise the status of the
#   last command executed.
# Notes:
#   Relies on GNU find (-regex, -printf) and on the `filedetails` function.
#######################################
function filereport() {
  local mode="dryrun"
  if [[ "${1:-}" == "run" ]]; then
    mode="run"
  else
    echo "Notice Dry-run mode"
  fi

  local service_name reports_dir
  service_name=$(basename -- "$(pwd)")
  reports_dir=~/reports/${service_name}
  local -r header="file,encoding,total_lines,total_characters,avg_line_size_bytes,avg_line_size_human"

  # Date directories, newest (highest sort key) first.
  local -a dates=()
  local name
  while IFS= read -r name; do
    dates+=("$name")
  done < <(find . -maxdepth 1 -mindepth 1 -regex '.*[0-9]+' -type d -printf '%f\n' | sort -r)

  # Check the directory the reports are actually written to.
  if [[ ! -d "$reports_dir" ]]; then
    if [[ "$mode" == "run" ]]; then
      mkdir -p -- "$reports_dir" || return 1
    else
      echo "Would create dir: $reports_dir"
    fi
  fi

  local dt report path
  for dt in "${dates[@]}"; do
    report="${reports_dir}/filesize_report_${dt}.txt"

    # Only emit the CSV header when starting a new (or empty) report so that
    # re-running does not scatter header rows through the data.
    if [[ "$mode" == "run" && ! -s "$report" ]]; then
      printf '%s\n' "$header" >> "$report"
    fi

    # NUL-delimited names on fd 3: safe for paths with whitespace, and keeps
    # stdin untouched for whatever filedetails runs.
    # `-size +50c` (more than 50 bytes) already excludes empty files.
    while IFS= read -r -d '' -u 3 path; do
      printf '%s\n' "Getting details for $path"
      if [[ "$mode" == "run" ]]; then
        filedetails "$path" >> "$report"
      else
        printf '%s\n' " Would run: filedetails $path >> $report"
      fi
    done 3< <(find "${dt}/" -name "*.bz2" -not -name "*pid.bz2" -not -name "*.py.bz2" \
      -not -name "*.hdr.bz2" -type f -size +50c -print0)
  done
}
#######################################
# Prints one CSV row describing a (possibly bzip2-compressed) text file:
#   <path>,<encoding>,<lines>,<characters>,<avg line size>,<avg line size human>
#
# Arguments:
#   $1 - path of the file to inspect. Files named <something>.bz2 are
#        decompressed with bzcat; everything else is read with cat.
# Outputs:
#   The CSV row on stdout (no header).
# Notes:
#   The encoding column comes from the `getenc` function. Character counts
#   use awk's length(), which excludes the line terminators.
#######################################
function filedetails() {
  local path="$1"
  local filename="${path##*/}" # Strip longest match of */ from start

  # A bare ".bz2" (extension without a base name) is a plain dot-file, so at
  # least one character is required in front of the extension.
  local reader="cat"
  if [[ "$filename" == ?*.bz2 ]]; then
    reader="bzcat"
  fi

  local enc
  enc=$(getenc "$path")

  # Never use data as the printf format: a '%' in the path would be
  # interpreted as a conversion specifier.
  printf '%s,%s,' "$path" "$enc"

  "$reader" -- "$path" | awk '
    # Scales a byte count by 1024 and returns the number ("size"), the
    # unit ("unit"), or both concatenated (any other type).
    function human_size(bytes, type,    units, n, idx)
    {
      n = split("B KB MB GB", units, " ")
      idx = 1
      # Stop at the largest known unit instead of indexing past the table.
      while (bytes >= 1024 && idx < n) {
        bytes /= 1024
        idx++
      }
      if (type == "unit")
        return units[idx]
      else if (type == "size")
        return sprintf("%.1f", bytes)
      else
        return (bytes units[idx])
    }
    # Explicit zeros so that empty input reports "0,0" rather than ",".
    BEGIN { sum = 0; cnt = 0 }
    {
      sum += length($0)
      cnt++
    }
    END {
      if (sum == 0 || cnt == 0)
        size = 0
      else
        size = sum / cnt
      print cnt "," sum "," sprintf("%.f", size) "," human_size(size, "size") human_size(size, "unit")
    }
  '
}
#######################################
# Prints the character encoding of a file as detected by file(1),
# e.g. "us-ascii", "utf-8" or "binary".
#
# Arguments:
#   $1 - path of the file to inspect.
# Outputs:
#   The encoding name on stdout.
# Notes:
#   Asks file(1) for the encoding alone (brief mode) instead of cutting it
#   out of the "name: type; charset=..." line, which broke as soon as the
#   file name contained whitespace.
#######################################
function getenc() {
  file -b --mime-encoding -- "$1"
}
#######################################
# Prints a byte count in human-readable form with two decimals,
# e.g. 1536 -> "1.50KB".
#
# Arguments:
#   $1 - size in bytes (a missing or non-numeric value is treated as 0).
# Outputs:
#   "<number><unit>" followed by a newline on stdout.
# Notes:
#   The value is handed to awk as a variable instead of being echoed
#   unquoted, so it is not subject to word splitting or globbing. Scaling
#   stops at the largest known unit, so huge values always carry a unit.
#######################################
function hrsize() {
  awk -v bytes="${1:-0}" 'BEGIN {
    n = split("B KB MB GB TB PB", units, " ")
    bytes += 0
    idx = 1
    while (bytes >= 1024 && idx < n) {
      bytes /= 1024
      idx++
    }
    printf "%.2f%s\n", bytes, units[idx]
  }'
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment