Created
January 20, 2018 03:30
-
-
Save weibeld/d2a01b92e563112d0db4d8a2845e0602 to your computer and use it in GitHub Desktop.
Bash script for listing Unicode code points and their binary and hexadecimal representation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Given a Unicode code point 'U' and a number 'n', print one line for each of
# the n Unicode code points 'U' to 'U+n-1', consisting of:
#
#   1. The character 'c' corresponding to 'U'
#   2. The Unicode code point 'U'
#   3. The binary code of 'c' in the terminal's current encoding
#   4. The hexadecimal code of 'c' in the terminal's current encoding
#
# Notes:
#   - The maximal Unicode code point 'U+n-1' must be in the BMP (max. FFFF)
#   - The Unicode code point 'U' is given as a hex. number (i.e. without U+)
#   - Some non-printable chars like line-feed and tabs may garble the output
#   - To make sense of the output, make sure to KNOW YOUR TERMINAL'S ENCODING
#       - Run "locale", or check in the terminal's settings
#
# Daniel Weibel <[email protected]> 7 July 2017
#------------------------------------------------------------------------------#

# Abort on the first command that fails
set -e
# Print a short usage message (synopsis and an example call) to stdout.
# The script name is derived from $0 with its directory part stripped.
usage() {
  # ${0##*/} is used instead of $(basename $0): it needs no subprocess and is
  # safe when the script path contains spaces.
  cat <<EOF
Usage:
${0##*/} code-point number
Example:
${0##*/} c9 32
EOF
}
# Get character corresponding to a Unicode code point in the BMP (4 hex digits) | |
get-char() { | |
perl -C -e "print chr 0x$1" | |
} | |
# Increment a hex number by 1 (reads arg).
# The result is printed in uppercase hex WITHOUT padding; pipe it through
# pad-4 if a fixed width is needed.
# Arguments: $1 - hex number (upper- or lowercase digits, leading zeros allowed)
# Outputs:   $1 + 1 as uppercase hex, followed by a newline
# Returns:   1 (with a message on stderr) if $1 is not a hex number
increment() {
  if [[ ! "$1" =~ ^[0-9A-Fa-f]+$ ]]; then
    printf 'increment: not a hexadecimal number: %s\n' "$1" >&2
    return 1
  fi
  # 16#... makes the shell read the digits in base 16, so no external
  # calculator is needed and lowercase digits are accepted as well.
  printf '%X\n' "$(( 16#$1 + 1 ))"
}
# Pad a string to 4 digits with 0s (reads stdin).
# Input that is already 4 or more characters long is passed through unchanged.
# The result is written to stdout without a trailing newline.
pad-4() {
  local str
  str=$(cat)
  # printf's "%04s" is not portable: the 0 flag is undefined for %s and GNU
  # implementations pad with spaces. Prepend the zeros by hand instead.
  while (( ${#str} < 4 )); do
    str="0${str}"
  done
  printf '%s' "$str"
}
# Pad a string to multiples of 8 digits with 0s (reads stdin).
# The target width is the input length rounded up to the next multiple of 8
# (empty input is padded to 8 zeros). The result is written to stdout without
# a trailing newline.
pad-8() {
  local bits width
  bits=$(cat)
  width=$(( ((${#bits} - 1) / 8 + 1) * 8 ))
  # printf's "%0Ns" is not portable: the 0 flag is undefined for %s and GNU
  # implementations pad with spaces. Prepend the zeros by hand instead.
  while (( ${#bits} < width )); do
    bits="0${bits}"
  done
  printf '%s' "$bits"
}
# Get the hex code of a character in the terminal's encoding (reads arg).
# Arguments: the character (its bytes are taken exactly as passed in)
# Outputs:   the bytes as two-digit lowercase hex values in input order,
#            separated by single spaces (e.g. "c3 89")
encode() {
  # printf '%s' emits the bytes verbatim (echo -n is not portable and
  # interprets some arguments as options). od -tx1 dumps one byte at a time,
  # which keeps the byte order intact; hexdump's default output may group
  # bytes into 16-bit words in host byte order.
  printf '%s' "$*" \
    | od -An -v -tx1 \
    | tr -s '[:space:]' ' ' \
    | sed -e 's/^ //' -e 's/ $//'
}
# Transform all lowercase letters of a string to uppercase (reads stdin).
# Everything else is copied to stdout unchanged.
to-upper() {
  tr '[:lower:]' '[:upper:]'
}
# Remove all blanks (spaces and tabs) from a string (reads arg).
# Arguments: $1 - the string to collapse
# Outputs:   $1 without blanks, followed by a newline
collapse() {
  # Pattern substitution deletes every character of class [:blank:] without
  # spawning an external process.
  printf '%s\n' "${1//[[:blank:]]/}"
}
# Insert a space after every 8th character of a string (reads stdin).
# A space that would end up at the very end of a line is dropped again, so
# the groups are separated by, but not terminated with, a space.
split-bytes() {
  sed -e 's/.\{8\}/& /g' -e 's/ $//'
}
# Convert a hex number to binary (reads stdin).
# Each hex digit (upper- or lowercase) is translated into exactly four binary
# digits, so leading zero digits of the input are preserved in the output
# (e.g. 0041 -> 0000000001000001) and the output is never line-wrapped.
# Outputs: the binary digits followed by a newline; nothing for empty input
# Returns: 1 (with a message on stderr) on a character that is no hex digit
to-binary() {
  local hex digit i bits=''
  hex=$(cat)
  for (( i = 0; i < ${#hex}; i++ )); do
    digit=${hex:i:1}
    case "$digit" in
      0) bits+=0000 ;;
      1) bits+=0001 ;;
      2) bits+=0010 ;;
      3) bits+=0011 ;;
      4) bits+=0100 ;;
      5) bits+=0101 ;;
      6) bits+=0110 ;;
      7) bits+=0111 ;;
      8) bits+=1000 ;;
      9) bits+=1001 ;;
      A | a) bits+=1010 ;;
      B | b) bits+=1011 ;;
      C | c) bits+=1100 ;;
      D | d) bits+=1101 ;;
      E | e) bits+=1110 ;;
      F | f) bits+=1111 ;;
      *)
        printf 'to-binary: invalid hex digit: %s\n' "$digit" >&2
        return 1
        ;;
    esac
  done
  if [[ -n "$bits" ]]; then
    printf '%s\n' "$bits"
  fi
}
# Both positional arguments are required; otherwise show the usage text on
# stderr and fail.
if [[ "$#" -lt 2 ]]; then
  usage >&2
  exit 1
fi

# Validate the command-line args before doing any work: the code point must
# be 1-4 hex digits (BMP), the number of lines a non-negative decimal integer.
if [[ ! "$1" =~ ^[0-9A-Fa-f]{1,4}$ ]]; then
  printf '%s: invalid code point (expected 1-4 hex digits): %s\n' \
    "${0##*/}" "$1" >&2
  exit 1
fi
if [[ ! "$2" =~ ^[0-9]+$ ]]; then
  printf '%s: invalid number (expected a non-negative integer): %s\n' \
    "${0##*/}" "$2" >&2
  exit 1
fi

# Read command-line args
code_point=$(printf '%s' "$1" | to-upper | pad-4)
# 10# forces base 10, so a count with leading zeros (e.g. 08) is not taken
# for an octal number.
n=$(( 10#$2 ))

# Print one line per code point. A counting loop is used instead of
# "seq 1 $n" because some seq implementations count downwards for n = 0.
for (( i = 0; i < n; i++ )); do
  char=$(get-char "$code_point")
  hex_code=$(encode "$char" | to-upper)
  bin_code=$(collapse "$hex_code" | to-binary | pad-8 | split-bytes)
  printf '%s | U+%s | %s | %s\n' "$char" "$code_point" "$bin_code" "$hex_code"
  code_point=$(increment "$code_point" | pad-4)
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment