rgarner · June 27, 2014 17:46
diff --git a/fix.sh b/fix.sh
 ##
 # Find CDN output that has invalid UTF8 byte sequences.
 # Fix them.
 #
 # Scans the *.tsv files in the current directory, records the ones iconv
 # rejects, then rewrites each of those in place with every line that
 # contained an invalid byte removed.
 #
 # NB: This requires an iconv that supports --byte-subst, and mawk.
 # NB: Detection relies on iconv naming the offending file in its stderr
 #     message, and only names matching www.*\.tsv are picked up.
 #

 FAILURES=utf8_failures
 FAILURE_STRING=__WUBWUB__

 # Remove the scratch list of failing files (no error if it is absent).
 cleanup() {
   rm -f -- "$FAILURES"
 }

 # Write to $FAILURES, one per line, the name of every *.tsv file that
 # iconv cannot read as UTF8.
 find_failures() {
   local f
   for f in *.tsv; do
     # An unmatched glob stays literal; skip it rather than hand it to iconv.
     [ -e "$f" ] || continue
     # Redirection order matters: stderr is pointed at the pipe first, then
     # stdout is discarded, so grep only ever sees iconv's diagnostics.
     # -f UTF8 makes detection independent of the caller's locale.
     iconv -f UTF8 -t UTF8 "$f" 2>&1 >/dev/null \
       | grep -o 'www.*\.tsv'
   done > "$FAILURES"
 }

 # For every file listed in $FAILURES: replace each invalid byte with
 # $FAILURE_STRING, then drop every line carrying that marker.
 fix_failures() {
   local f
   # The list is read on fd 3 so commands in the loop body keep their stdin,
   # and line-by-line so names containing whitespace survive intact.
   while IFS= read -r f <&3; do
     iconv --byte-subst="$FAILURE_STRING" -f UTF8 -t UTF8 "$f" > "$f.new"
     # No output at all means iconv did not convert anything (for example it
     # does not understand --byte-subst). Overwriting the original from an
     # empty file would destroy it, so leave the original untouched.
     if [ ! -s "$f.new" ]; then
       printf 'Skipping %s: iconv produced no output\n' "$f" >&2
       rm -f -- "$f.new"
       continue
     fi
     mawk -v marker="$FAILURE_STRING" 'index($0, marker) == 0' "$f.new" > "$f" \
       && rm -- "$f.new"
   done 3< "$FAILURES"
 }

 # An interrupted run must not leave the scratch file behind, nor report success.
 trap 'cleanup; exit 1' SIGHUP SIGINT SIGTERM

 echo "Finding failures"
 find_failures

 echo "Fixing failures"
 fix_failures

 cleanup
	##
	# Find CDN output that has invalid UTF8 byte sequences.
	# Fix them.
	#
	# Scans the *.tsv files in the current directory, records the ones iconv
	# rejects, then rewrites each of those in place with every line that
	# contained an invalid byte removed.
	#
	# NB: This requires an iconv that supports --byte-subst, and mawk.
	# NB: Detection relies on iconv naming the offending file in its stderr
	#     message, and only names matching www.*\.tsv are picked up.
	#

	FAILURES=utf8_failures
	FAILURE_STRING=__WUBWUB__

	# Remove the scratch list of failing files (no error if it is absent).
	cleanup() {
	  rm -f -- "$FAILURES"
	}

	# Write to $FAILURES, one per line, the name of every *.tsv file that
	# iconv cannot read as UTF8.
	find_failures() {
	  local f
	  for f in *.tsv; do
	    # An unmatched glob stays literal; skip it rather than hand it to iconv.
	    [ -e "$f" ] || continue
	    # This must be a real pipe, not an escaped "\|" argument to iconv.
	    # Redirection order matters: stderr is pointed at the pipe first, then
	    # stdout is discarded, so grep only ever sees iconv's diagnostics.
	    # -f UTF8 makes detection independent of the caller's locale.
	    iconv -f UTF8 -t UTF8 "$f" 2>&1 >/dev/null \
	      | grep -o 'www.*\.tsv'
	  done > "$FAILURES"
	}

	# For every file listed in $FAILURES: replace each invalid byte with
	# $FAILURE_STRING, then drop every line carrying that marker.
	fix_failures() {
	  local f
	  # The list is read on fd 3 so commands in the loop body keep their stdin,
	  # and line-by-line so names containing whitespace survive intact.
	  while IFS= read -r f <&3; do
	    iconv --byte-subst="$FAILURE_STRING" -f UTF8 -t UTF8 "$f" > "$f.new"
	    # No output at all means iconv did not convert anything (for example it
	    # does not understand --byte-subst). Overwriting the original from an
	    # empty file would destroy it, so leave the original untouched.
	    if [ ! -s "$f.new" ]; then
	      printf 'Skipping %s: iconv produced no output\n' "$f" >&2
	      rm -f -- "$f.new"
	      continue
	    fi
	    mawk -v marker="$FAILURE_STRING" 'index($0, marker) == 0' "$f.new" > "$f" \
	      && rm -- "$f.new"
	  done 3< "$FAILURES"
	}

	# An interrupted run must not leave the scratch file behind, nor report success.
	trap 'cleanup; exit 1' SIGHUP SIGINT SIGTERM

	echo "Finding failures"
	find_failures

	echo "Fixing failures"
	fix_failures

	cleanup