Created
May 17, 2019 12:57
-
-
Save submachine/52b25cd150d9ecb0c9262217b38dc90b to your computer and use it in GitHub Desktop.
Test iconv with every possible input combination of two bytes for every supported character set
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Bash (not plain sh) is required: the script relies on arrays.
# Pathname expansion is switched off with `set -f' instead of a shebang flag
# so that it is also in effect when the script is started as `bash <script>'.
set -f
# Run iconv(1) tests with every possible input combination of two bytes.
# Copyright (C) 2019 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# Contributed by Arjun Shankar <[email protected]>, 2019.
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
# The gconv-modules file is supplied along with glibc and contains a list of
# all supported character set conversions; typically, each character set has
# two supported conversions, one from $charset to INTERNAL representation,
# and one from INTERNAL to $charset. The file also lists aliases for
# character sets on separate lines, which are just different names for the
# same charset.
# Pick the first gconv-modules file that exists; the lib64 directory is
# checked before the plain lib one.
modulelist_file=""
for gconv_dir in "/usr/lib64/gconv" "/usr/lib/gconv"; do
  if [ -f "$gconv_dir/gconv-modules" ]; then
    modulelist_file="$gconv_dir/gconv-modules"
    break
  fi
done
if [ -z "$modulelist_file" ]; then
  # Diagnostics go to stderr so they do not mix with the test log on stdout.
  echo "No module list found installed on system" >&2
  exit 1
fi

# Build the list of character sets to test, one name per line:
#  - only lines starting with 'module' are kept (the others are aliases);
#  - 'INTERNAL->module' lines are dropped, so each charset is taken from its
#    'charset->INTERNAL' line;
#  - the charset name is whitespace-separated column 2, with its '//' removed;
#  - some charsets have multiple direct conversions to other charsets for
#    efficiency; `sort -u' drops those dupes.
charset_list="$(awk '/^module/ && $2 !~ /^INTERNAL/ { sub(/\/\//, "", $2); print $2 }' \
  "$modulelist_file" | sort -u)"

# List of known failures; tested preferentially before exhaustive 2 byte search.
# Each entry has the form "twobyte;c;to_cs".
failarray=()
# Look at a return code and say if the test passed or not.
# Globals:   ret (read) - exit status of the last test, used when no
#            argument is given
# Arguments: $1 - optional exit status to judge instead of $ret
# Returns:   0 (pass) or 1 (fail)
is_test_pass ()
{
  local status="${1:-$ret}"

  # A normal (success/error) iconv run shouldn't return >127;
  # 124 is returned by `timeout' if iconv hangs.
  if [ "$status" -gt 127 ] || [ "$status" -eq 124 ]; then
    # Failure
    return 1
  fi
  # Success
  return 0
}
# Logs the test result to stdout.
# Globals:  ret (read)     - exit status of the last test
#           charset (read) - character set under test
#           twobyte, c, to_cs (read) - inputs of the last test; only printed
#           for failures, as a command line that reproduces the problem
# Outputs:  "OK: <charset>" for a pass, otherwise
#           "<HANG|SEGFAULT|UNEXPECTED>: <charset>; <reproducer>"
log_result ()
{
  local result

  if [ "$ret" -eq 124 ] || [ "$ret" -eq 137 ]; then # timeout/hang
    result="HANG"
  elif [ "$ret" -eq 139 ]; then # segfault
    result="SEGFAULT"
  elif [ "$ret" -gt 127 ]; then # unexpected error
    result="UNEXPECTED"
  else
    result="OK"
  fi

  if [ "$result" = "OK" ]; then
    printf '%s: %s\n' "$result" "$charset"
  else
    # %s prints $twobyte verbatim, so its \xHH escapes stay readable and the
    # logged command can be pasted into a shell as is.
    printf '%s: %s; echo -en "%s" | iconv %s -f %s -t "%s"\n' \
      "$result" "$charset" "$twobyte" "$c" "$charset" "$to_cs"
  fi
}
# Runs one iconv conversion of the bytes described by $twobyte.
# Requires $twobyte input, $c flag, $charset, and $to_cs to be set; sets $ret
# Globals:  twobyte (read) - input bytes written as \xHH escapes
#           c (read)       - either empty or "-c"
#           charset (read) - source character set (iconv -f)
#           to_cs (read)   - target character set incl. suffixes (iconv -t)
#           ret (written)  - exit status of the `timeout ... iconv' command
execute_test ()
{
  # %b expands the \xHH escapes of $twobyte into raw bytes.
  # ${c:+"$c"} adds the flag only when $c is non-empty, so an empty $c does
  # not turn into an empty argument.
  # iconv output and diagnostics are discarded; only the exit status matters.
  printf '%b' "$twobyte" |
    timeout -k 4 3 iconv ${c:+"$c"} -f "$charset" -t "$to_cs" >/dev/null 2>&1
  ret=$?
}
# Main test loop

# Re-runs every cached failure (entries of $failarray, each of the form
# "twobyte;c;to_cs") against the current $charset.
# Globals:  failarray (read); twobyte, c, to_cs (written);
#           ret (written by execute_test)
# Returns:  1 as soon as one cached command line fails, 0 otherwise
run_cached_failures ()
{
  local failcommand

  for failcommand in "${failarray[@]}"; do
    # Split the entry in the current shell: piping it into `read' would run
    # `read' in a subshell and leave twobyte/c/to_cs unchanged here.
    # -r keeps the backslashes of the \xHH escapes intact.
    IFS=";" read -r twobyte c to_cs <<< "$failcommand"
    execute_test
    if ! is_test_pass; then
      return 1
    fi
  done
  return 0
}

# Runs an exhaustive search using all 2-byte input combinations for the
# current $charset, with every combination of the "ignore" option and the
# "ignore"/"transliterate" suffixes.
# Globals:  twobyte, c, to_cs (written); ret (written by execute_test);
#           failarray (appended to on the first failure)
# Returns:  1 on the first failing test, 0 if every test passed
run_exhaustive_search ()
{
  local b1 b2 i t

  for ((b1 = 0; b1 <= 255; b1++)); do
    for ((b2 = 0; b2 <= 255; b2++)); do
      # Store the two bytes as literal \xHH escapes, e.g. \x00\x1f
      printf -v twobyte '\\x%02x\\x%02x' "$b1" "$b2"
      for c in "" "-c"; do # "ignore" passed as an option
        for i in "" "//IGNORE"; do # "ignore" passed as a suffix
          for t in "" "//TRANSLIT"; do # "transliterate" passed as a suffix
            # When both TRANSLIT and IGNORE are ON, we test two times:
            if [ -n "$i" ] && [ -n "$t" ]; then
              # First we test with "//IGNORE//TRANSLIT"
              to_cs="UTF-8$i$t"
              execute_test
              if is_test_pass; then
                # Then we test with "//TRANSLIT//IGNORE"
                to_cs="UTF-8$t$i"
                execute_test
              fi
            else
              # Otherwise, we test only once:
              to_cs="UTF-8$t$i"
              execute_test
            fi
            if ! is_test_pass; then
              # Cache the failed commandline to speed up future runs
              failarray+=("$twobyte;$c;$to_cs")
              # Stop testing this charset; the caller logs the error
              return 1
            fi
          done
        done
      done
    done
  done
  return 0
}

# Tests every character set named in $charset_list and logs one result line
# per character set.
# Globals:  charset_list (read); charset (written)
run_all_charsets ()
{
  # $charset_list is deliberately unquoted: it is split into one word per
  # character set name.
  for charset in $charset_list; do
    # First run all cached test failures from previous charsets; only if
    # they all pass is the exhaustive search worth running.
    if run_cached_failures; then
      run_exhaustive_search
    fi
    log_result
  done
}

run_all_charsets
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment