Skip to content

Instantly share code, notes, and snippets.

@0x5742
Created April 6, 2016 18:43
Show Gist options
  • Select an option

  • Save 0x5742/6dabddf2f9c1dd4435db9a5704971b9f to your computer and use it in GitHub Desktop.

Select an option

Save 0x5742/6dabddf2f9c1dd4435db9a5704971b9f to your computer and use it in GitHub Desktop.
Unicode character lookup tool
#!/bin/sh
# Look up Unicode characters by name; friendlier than running grep over
# UnicodeData.txt by hand.

# Path of the UnicodeData.txt copy that the lookup reads.
UnicodeData="$HOME/.local/share/UnicodeData.txt"
# join SEPARATOR [ELEMENT ...]
#
# Writes the ELEMENTs to stdout as one line, with SEPARATOR inserted between
# consecutive elements. With no elements an empty line is written.
#
# The result is emitted with printf rather than echo: echo implementations
# that process escapes (dash, or bash with xpg_echo) would turn a separator
# such as '\b.*\b' into backspace characters, and an element equal to "-n"
# would be taken as an echo option.
#
# Variables are deliberately not declared 'local' (not available in strict
# POSIX sh); they carry a join_ prefix to avoid clobbering the caller's names.
join() {
  # Nothing to join: only the separator (or nothing at all) was given.
  if [ "$#" -lt 2 ]; then
    printf '\n'
    return 0
  fi
  join_sep=$1
  join_out=$2
  shift 2
  for join_item in "$@"; do
    join_out="${join_out}${join_sep}${join_item}"
  done
  printf '%s\n' "$join_out"
}
# Normalise the command line with getopt(1) so that the option words come
# first, followed by a '--' sentinel and then the search terms.
#
# The getopt output is captured first so that its exit status can be tested
# directly; checking $? after 'eval set -- ...' only reports the status of
# 'set', which hid every getopt failure. Quoting the captured text keeps the
# shell from word-splitting or globbing it before eval re-parses it.
# NOTE(review): the use of eval assumes a getopt that emits shell-quoted
# output (e.g. the enhanced util-linux getopt) - confirm on the target system.
if ! parsed_args=$(getopt -n "$0" FWwHhab "$@"); then
  echo >&2 "usage: $0 [-F|-W|-w] [-H|-h] [-a|-b] search text ..."
  exit 1
fi
eval set -- "$parsed_args"
# Defaults
fullname=n    # y: the search terms must make up one complete ';'-delimited field
bound=''      # regex fragment placed on both sides of every search term
between='.*'  # regex fragment placed between consecutive search terms
header=y      # y: print the column header lines
astral=y      # n: restrict the pattern to four-character code fields

# Consume the option words up to (and including) the '--' sentinel, leaving
# only the search terms in "$@". The loop is bounded by $# so that a missing
# sentinel cannot make it spin forever, and any unexpected word is treated
# as the first search term instead of being thrown away.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --) shift; break ;;
    -F) fullname=y; bound=''; between=' ' ;;     # search for full name only
    -W) fullname=n; bound=''; between='.*' ;;    # no word boundary
    -w) fullname=n; bound='\b'; between='.*' ;;  # whole words only
    -H) header=y ;;                              # display header line
    -h) header=n ;;                              # omit header line
    -a) astral=y ;;                              # list astral planes
    -b) astral=n ;;                              # list basic multilingual only
    *) break ;;
  esac
  shift
done

# Assemble the pattern: every term is wrapped in $bound and consecutive terms
# are separated by $between.
joiner="$bound$between$bound"
re="$bound"$(join "$joiner" "$@")"$bound"
if [ "$fullname" = "y" ]; then
  # Surrounding ';' force the match to span a whole field.
  re=";$re;"
fi
if [ "$astral" = "n" ]; then
  # Only accept lines whose first field is exactly four characters long.
  re="^....(?=;).*$re"
fi
# Produce the optional header plus every line of $UnicodeData matching $re
# (case-insensitively), then: prefix each data line with the character
# itself, keep the first three fields, align them into columns and page the
# result.
(
  # '=' is the string comparison defined for POSIX test(1); the previous
  # '==' is rejected by shells such as dash, so the header never appeared
  # when /bin/sh was not bash.
  if [ "$header" = "y" ]; then
    # Fields of a UnicodeData.txt line (semicolon separated):
    #
    #  1 Code value
    #  2 Character name
    #  3 General category
    #      (Normative)                    (Informative)
    #      Lu Letter, Uppercase           Lm Letter, Modifier
    #      Ll Letter, Lowercase           Lo Letter, Other
    #      Lt Letter, Titlecase           Pc Punctuation, Connector
    #      Mn Mark, Non-Spacing           Pd Punctuation, Dash
    #      Mc Mark, Spacing Combining     Ps Punctuation, Open
    #      Me Mark, Enclosing             Pe Punctuation, Close
    #      Nd Number, Decimal Digit       Pi Punctuation, Initial quote
    #      Nl Number, Letter              Pf Punctuation, Final quote
    #      No Number, Other               Po Punctuation, Other
    #      Zs Separator, Space            Sm Symbol, Math
    #      Zl Separator, Line             Sc Symbol, Currency
    #      Zp Separator, Paragraph        Sk Symbol, Modifier
    #      Cc Other, Control              So Symbol, Other
    #      Cf Other, Format
    #      Cs Other, Surrogate
    #      Co Other, Private Use
    #      Cn Other, Not Assigned
    #  4 Canonical combining classes
    #  5 Bidirectional category
    #      L   Left-to-Right              AN  Arabic Number
    #      LRE Left-to-Right Embedding    CS  Common Number Separator
    #      LRO Left-to-Right Override     NSM Non-Spacing Mark
    #      R   Right-to-Left              BN  Boundary Neutral
    #      AL  Right-to-Left Arabic       B   Paragraph Separator
    #      RLE Right-to-Left Embedding    S   Segment Separator
    #      RLO Right-to-Left Override     WS  Whitespace
    #      PDF Pop Directional Format     ON  Other Neutrals
    #      EN  European Number
    #      ES  European Number Separator
    #      ET  European Number Terminator
    #  6 Character decomposition mapping
    #  7 Decimal digit value
    #  8 Digit value
    #  9 Numeric value
    # 10 Mirrored?
    # 11 Unicode 1.0 name
    # 12 ISO 10646 comment
    # 13 Uppercase mapping
    # 14 Lowercase mapping
    # 15 Titlecase mapping
    #
    # The header rows are ';'-separated like the data; only their first
    # three fields survive the cut below.
    echo 'Ch;Code;Character name;;;;;;;;;;;;;'
    echo '--;----;--------------;;;;;;;;;;;;;'
  fi
  # -e marks the next word as the pattern even when it starts with '-'.
  pcregrep -i -e "$re" "$UnicodeData"
) |
  # Rewrite the leading hex code field "XXXX;" as "<character>;U+XXXX;".
  # The header rows do not start with a pure hex field and pass unchanged.
  perl -Mutf8 -CS -pe 's/^([0-9A-F]+);/sprintf "%s;U+%s;", chr(hex($1)), $1/ie' |
  # Keep character, code point and character name only.
  cut -d';' -f 1,2,3 |
  column -ts';' | less -Fs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment