submachine · May 17, 2019 12:57
diff --git a/iconv-2byte-fuzzer.sh b/iconv-2byte-fuzzer.sh
 #!/bin/sh -f
 # Run iconv(1) tests with every possible input combination of two bytes.
 # Copyright (C) 2019 Free Software Foundation, Inc.
 # This file is part of the GNU C Library.
 # Contributed by Arjun Shankar <[email protected]>, 2019.

 # The GNU C Library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.

 # The GNU C Library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # Lesser General Public License for more details.

 # You should have received a copy of the GNU Lesser General Public
 # License along with the GNU C Library; if not, see
 # <http://www.gnu.org/licenses/>.


 # The gconv-modules file is supplied along with glibc and contains a list of
 # all supported character set conversions; typically, each character set has
 # two supported conversions, one from $charset to INTERNAL representation,
 # and one from INTERNAL to $charset. The file also lists aliases for
 # character sets on separate lines, which are just different names for the
 # same charset.

 # Locate the installed gconv-modules file: the lib64 directory is tried
 # first, then the plain lib directory.
 if [ -f "/usr/lib64/gconv/gconv-modules" ]; then
  modulelist_file="/usr/lib64/gconv/gconv-modules"
 elif [ -f "/usr/lib/gconv/gconv-modules" ]; then
  modulelist_file="/usr/lib/gconv/gconv-modules"
 else
  # Diagnostics go to stderr; stdout is reserved for the per-charset report
  echo "No module list found installed on system" >&2
  exit 1
 fi

 # Names of all character sets that have a conversion module, one per line
 charset_list="$(grep '^module' "$modulelist_file" | # other lines are aliases
                 grep -v '^module\s*INTERNAL' | # drop 'INTERNAL->module' lines
                 sed 's/\s\+/ /g' | # we don't want 'cut' below to deal with TABs
                 cut -d' ' -f2 | # grab the name of the character set on col. 2
                 sed 's|//||' | # remove the first '//' from the name
                 sort -u)" # some charsets have multiple direct conversions
                           # to other charsets for efficiency; drop those dupes


 # Cache of command lines that already made iconv misbehave, one entry per
 # failure in the form "twobyte;c;to_cs"; these are tried on each charset
 # before the exhaustive 2 byte search
 declare -a failarray=()


 # Decide from the exit status stored in $ret whether the last test passed.
 # Returns 1 (failure) when $ret is above 127 or equal to 124, and 0 (pass)
 # for every other value.
 is_test_pass ()
 {
  # A normal (success/error) iconv run shouldn't return anything above 127
  [ "$ret" -gt "127" ] && return 1

  # 124 is what `timeout' returns if iconv hangs
  [ "$ret" -eq "124" ] && return 1

  return 0
 }


 # Print a one-line verdict for the current charset on stdout.
 # Reads: $ret (exit status of the last test) and $charset; for failures
 #        also $twobyte, $c and $to_cs, which are printed as a command line
 #        that reproduces the failing run.
 # Sets:  $result to HANG, SEGFAULT, UNEXPECTED or OK.
 log_result ()
 {
  if [ "$ret" -eq "124" ] || [ "$ret" -eq "137" ]; then # timeout/hang
    result="HANG"
  elif [ "$ret" -eq "139" ]; then # segfault
    result="SEGFAULT"
  elif [ "$ret" -gt "127" ]; then # unexpected error
    result="UNEXPECTED"
  else
    result="OK"
  fi

  # printf with quoted arguments prints every value verbatim: no word
  # splitting of $charset and no dependence on how echo treats -n
  if [ "$result" = "OK" ]; then
    printf '%s: %s\n' "$result" "$charset"
  else
    printf '%s: %s; echo -en "%s" | iconv %s -f %s -t "%s"\n' \
      "$result" "$charset" "$twobyte" "$c" "$charset" "$to_cs"
  fi
 }


 # Pipe the test input into iconv and record how the run ended.
 # Reads: $twobyte (input written as echo -e escapes), $c (extra iconv flag,
 #        may be empty), $charset (source charset), $to_cs (target charset).
 # Sets:  $ret to the exit status of the `timeout'-wrapped iconv run.
 execute_test ()
 {
  # ${c:+"$c"} vanishes when $c is empty instead of passing an empty argument.
  # All iconv output is discarded, only the exit status matters; the
  # redirection is spelled '>/dev/null 2>&1' because '&>' is a bash extension
  # that a plain POSIX sh reads as '&' (run in background) followed by '>'.
  echo -en "$twobyte" |
  timeout -k 4 3 iconv ${c:+"$c"} -f "$charset" -t "$to_cs" >/dev/null 2>&1
  ret=$?
 }


 # Main test loop: for every charset, replay the cached failures first and
 # run the exhaustive 2 byte search only if none of them fails.
 # $charset_list is left unquoted on purpose so that it splits into names.
 for charset in $charset_list; do

  # First run all cached test failures from previous charsets
  for failcommand in "${failarray[@]}"; do

    # Split the cached "twobyte;c;to_cs" entry back into its variables.  A
    # here-string keeps `read' in the current shell (as the last stage of a
    # pipeline it runs in a subshell and its assignments are lost), and -r
    # preserves the backslashes of the \xNN text held in $twobyte.
    IFS=";" read -r twobyte c to_cs <<< "$failcommand"
    execute_test
    if ! is_test_pass; then
      break
    fi
  done

  if [ "${#failarray[@]}" -ne 0 ] && ! is_test_pass; then
    log_result
    continue
  fi

  # Then run an exhaustive search using all 2-byte input combinations
  for b1 in $(seq 0 255); do
    for b2 in $(seq 0 255); do

      # Literal text of the form \xNN\xNN; echo -e turns it into bytes later
      twobyte="$(printf '\\x%02x\\x%02x' "$b1" "$b2")"

      for c in "" "-c"; do # "ignore" passed as an option
        for i in "" "//IGNORE"; do # "ignore" passed as a suffix
          for t in "" "//TRANSLIT"; do # "transliterate" passed as a suffix

            # When both TRANSLIT and IGNORE are ON, we test two times:
            if [ -n "$i" ] && [ -n "$t" ]; then

              # First we test with "//IGNORE//TRANSLIT"
              to_cs="UTF-8$i$t"
              execute_test

              if is_test_pass; then

                # Then we test with "//TRANSLIT//IGNORE"
                to_cs="UTF-8$t$i"
                execute_test
              fi

            else
              # Otherwise, we test only once:
              to_cs="UTF-8$t$i"
              execute_test
            fi

            if ! is_test_pass; then

              # Cache the failed commandline to speed up future runs
              failarray+=("$twobyte;$c;$to_cs")

              # Stop testing this charset and go down to log an error
              break 5
            fi
          done #1 (t)
        done #2 (i)
      done #3 (c)
    done #4 (b2)
  done #5 (b1)

  log_result
 done
	#!/bin/sh -f
	# Run iconv(1) tests with every possible input combination of two bytes.
	# Copyright (C) 2019 Free Software Foundation, Inc.
	# This file is part of the GNU C Library.
	# Contributed by Arjun Shankar <[email protected]>, 2019.

	# The GNU C Library is free software; you can redistribute it and/or
	# modify it under the terms of the GNU Lesser General Public
	# License as published by the Free Software Foundation; either
	# version 2.1 of the License, or (at your option) any later version.

	# The GNU C Library is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# Lesser General Public License for more details.

	# You should have received a copy of the GNU Lesser General Public
	# License along with the GNU C Library; if not, see
	# <http://www.gnu.org/licenses/>.


	# The gconv-modules file is supplied along with glibc and contains a list of
	# all supported character set conversions; typically, each character set has
	# two supported conversions, one from $charset to INTERNAL representation,
	# and one from INTERNAL to $charset. The file also lists aliases for
	# character sets on separate lines, which are just different names for the
	# same charset.

	# Locate the installed gconv-modules file: the lib64 directory is tried
	# first, then the plain lib directory.
	if [ -f "/usr/lib64/gconv/gconv-modules" ]; then
	  modulelist_file="/usr/lib64/gconv/gconv-modules"
	elif [ -f "/usr/lib/gconv/gconv-modules" ]; then
	  modulelist_file="/usr/lib/gconv/gconv-modules"
	else
	  # Diagnostics go to stderr; stdout is reserved for the per-charset report
	  echo "No module list found installed on system" >&2
	  exit 1
	fi

	# Names of all character sets that have a conversion module, one per line
	charset_list="$(grep '^module' "$modulelist_file" | # other lines are aliases
	  grep -v '^module\s*INTERNAL' | # drop 'INTERNAL->module' lines
	  sed 's/\s\+/ /g' | # we don't want 'cut' below to deal with TABs
	  cut -d' ' -f2 | # grab the name of the character set on col. 2
	  sed 's|//||' | # remove the first '//' from the name
	  sort -u)" # some charsets have multiple direct conversions
	# to other charsets for efficiency; drop those dupes


	# Cache of command lines that already made iconv misbehave, one entry per
	# failure in the form "twobyte;c;to_cs"; these are tried on each charset
	# before the exhaustive 2 byte search
	declare -a failarray=()


	# Decide from the exit status stored in $ret whether the last test passed.
	# Returns 1 (failure) when $ret is above 127 or equal to 124, and 0 (pass)
	# for every other value.
	is_test_pass ()
	{
	  # Normal (success/error) iconv run shouldn't return >127
	  # except when: 124 is returned by `timeout' if iconv hangs

	  if [ "$ret" -gt "127" ] || [ "$ret" -eq "124" ]; then
	    # Failure
	    return 1
	  else
	    # Success
	    return 0
	  fi
	}


	# Print a one-line verdict for the current charset on stdout.
	# Reads: $ret (exit status of the last test) and $charset; for failures
	#        also $twobyte, $c and $to_cs, which are printed as a command line
	#        that reproduces the failing run.
	# Sets:  $result to HANG, SEGFAULT, UNEXPECTED or OK.
	log_result ()
	{
	  if [ "$ret" -eq "124" ] || [ "$ret" -eq "137" ]; then # timeout/hang
	    result="HANG"
	  elif [ "$ret" -eq "139" ]; then # segfault
	    result="SEGFAULT"
	  elif [ "$ret" -gt "127" ]; then # unexpected error
	    result="UNEXPECTED"
	  else
	    result="OK"
	  fi

	  # printf with quoted arguments prints every value verbatim: no word
	  # splitting of $charset and no dependence on how echo treats -n
	  if [ "$result" = "OK" ]; then
	    printf '%s: %s\n' "$result" "$charset"
	  else
	    printf '%s: %s; echo -en "%s" | iconv %s -f %s -t "%s"\n' \
	      "$result" "$charset" "$twobyte" "$c" "$charset" "$to_cs"
	  fi
	}


	# Pipe the test input into iconv and record how the run ended.
	# Reads: $twobyte (input written as echo -e escapes), $c (extra iconv flag,
	#        may be empty), $charset (source charset), $to_cs (target charset).
	# Sets:  $ret to the exit status of the `timeout'-wrapped iconv run.
	execute_test ()
	{
	  # ${c:+"$c"} vanishes when $c is empty instead of passing an empty
	  # argument.  All iconv output is discarded, only the exit status matters;
	  # the redirection is spelled '>/dev/null 2>&1' because '&>' is a bash
	  # extension that a plain POSIX sh reads as '&' followed by '>'.
	  echo -en "$twobyte" |
	  timeout -k 4 3 iconv ${c:+"$c"} -f "$charset" -t "$to_cs" >/dev/null 2>&1
	  ret=$?
	}


	# Main test loop: for every charset, replay the cached failures first and
	# run the exhaustive 2 byte search only if none of them fails.
	# $charset_list is left unquoted on purpose so that it splits into names.
	for charset in $charset_list; do

	  # First run all cached test failures from previous charsets
	  for failcommand in "${failarray[@]}"; do

	    # Split the cached "twobyte;c;to_cs" entry back into its variables.  A
	    # here-string keeps `read' in the current shell (as the last stage of a
	    # pipeline it runs in a subshell and its assignments are lost), and -r
	    # preserves the backslashes of the \xNN text held in $twobyte.
	    IFS=";" read -r twobyte c to_cs <<< "$failcommand"
	    execute_test
	    if ! is_test_pass; then
	      break
	    fi
	  done

	  if [ "${#failarray[@]}" -ne 0 ] && ! is_test_pass; then
	    log_result
	    continue
	  fi

	  # Then run an exhaustive search using all 2-byte input combinations
	  for b1 in $(seq 0 255); do
	    for b2 in $(seq 0 255); do

	      # Literal text of the form \xNN\xNN; echo -e turns it into bytes later
	      twobyte="$(printf '\\x%02x\\x%02x' "$b1" "$b2")"

	      for c in "" "-c"; do # "ignore" passed as an option
	        for i in "" "//IGNORE"; do # "ignore" passed as a suffix
	          for t in "" "//TRANSLIT"; do # "transliterate" passed as a suffix

	            # When both TRANSLIT and IGNORE are ON, we test two times:
	            if [ -n "$i" ] && [ -n "$t" ]; then

	              # First we test with "//IGNORE//TRANSLIT"
	              to_cs="UTF-8$i$t"
	              execute_test

	              if is_test_pass; then

	                # Then we test with "//TRANSLIT//IGNORE"
	                to_cs="UTF-8$t$i"
	                execute_test
	              fi

	            else
	              # Otherwise, we test only once:
	              to_cs="UTF-8$t$i"
	              execute_test
	            fi

	            if ! is_test_pass; then

	              # Cache the failed commandline to speed up future runs
	              failarray+=("$twobyte;$c;$to_cs")

	              # Stop testing this charset and go down to log an error
	              break 5
	            fi
	          done #1 (t)
	        done #2 (i)
	      done #3 (c)
	    done #4 (b2)
	  done #5 (b1)

	  log_result
	done