Last active
January 5, 2018 15:57
-
-
Save hartfordfive/67d81b0df19f4df95f4450b86576fcac to your computer and use it in GitHub Desktop.
Bash functions to get the encoding and the number of characters of a given text file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function filereport() { | |
if [ "$1" != "run" ]; then | |
echo "Notice Dry-run mode" | |
MODE="dryrun" | |
else | |
MODE="run" | |
fi | |
SERVICE_NAME=$(basename `pwd`) | |
BASE_DIR=pwd | |
REPORTS_DIR=~/reports/$SERVICE_NAME | |
# find . -maxdepth 1 -mindepth 1 -regex '.*[0-9]+' -type d -regextype posix-egrep -regex '[^0-9]+2017.*' -printf '%f\n' | |
DATES=$(find . -maxdepth 1 -mindepth 1 -regex '.*[0-9]+' -type d -printf '%f\n' | sort | tac) | |
if [ ! -d ~/$SERVICE_NAME ]; then | |
if [ "$MODE" != "dryrun" ]; then | |
mkdir -p $REPORTS_DIR | |
else | |
echo "Would create dir: $REPORTS_DIR" | |
fi | |
fi | |
for dt in $DATES | |
do | |
#FILES=$(find $dt/ -name "*.bz2" -not -name "*pid.bz2" -not -name "*.py.bz2" -type f -regex '.*[0-9]+$') | |
FILES=$(find $dt/ -name "*.bz2" -not -name "*pid.bz2" -not -name "*.py.bz2" -not -name "*.hdr.bz2" -type f -not -size 0 -size +50c) | |
if [ "$MODE" != "dryrun" ]; then | |
echo "file,encoding,total_lines,total_characters,avg_line_size_bytes,avg_line_size_human" >> $REPORTS_DIR/filesize_report_$dt.txt | |
fi | |
for file in $FILES | |
do | |
echo "Getting details for $file" | |
if [ "$MODE" != "dryrun" ]; then | |
filedetails $file >> $REPORTS_DIR/filesize_report_$dt.txt | |
else | |
echo " Would run: filedetails $file >> $REPORTS_DIR/filesize_report_$dt.txt" | |
fi | |
done | |
done | |
} | |
function filedetails() { | |
file=$1 | |
filename="${file##*/}" # Strip longest match of */ from start | |
dir="${fullpath:0:${#fullpath} - ${#filename}}" # Substring from 0 thru pos of filename | |
base="${filename%.[^.]*}" # Strip shortest match of . plus at least one non-dot char from end | |
ext="${filename:${#base} + 1}" # Substring from len of base thru end | |
if [[ -z "$base" && -n "$ext" ]]; then # If we have an extension and no base, it's really the base | |
base=".$ext" | |
ext="" | |
fi | |
ENC=$(getenc $1) | |
printf "$file,$ENC," | |
if [ "$ext" == "bz2" ]; then | |
CMD=bzcat | |
else | |
CMD=cat | |
fi | |
$CMD $1 | awk ' | |
function human_size(bytes, type) | |
{ | |
split( "B KB MB GB" , v ); | |
s=1; | |
while( bytes>=1024 ){ | |
bytes/=1024; | |
s++ | |
} | |
if (type == "unit") | |
return v[s] | |
else if ( type == "size") | |
return sprintf("%.1f",bytes) | |
else | |
return (bytes v[s]) | |
} | |
{ | |
sum += length($0); | |
cnt++ | |
} | |
END { | |
if (sum == 0 || cnt == 0) | |
size = 0 | |
else | |
size = sum/cnt | |
print cnt "," sum "," sprintf("%.f",size) "," human_size(size, "size") human_size(size, "unit") | |
} | |
' | |
} | |
function getenc() { | |
file -i $1 | awk '{print $3}' | cut -d'=' -f 2 | |
} | |
function hrsize { | |
echo $1 | awk '{ split( "B KB MB GB" , v ); s=1; while( $1>=1024 ){ $1/=1024; s++ } printf "%.2f",$1; print v[s] }' | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment