-
-
Save 0x5742/6dabddf2f9c1dd4435db9a5704971b9f to your computer and use it in GitHub Desktop.
Unicode character lookup tool
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Unicode character lookup tool.
# Somewhat more convenient than grepping UnicodeData.txt directly.
#
# Usage: $0 [-F|-W|-w] [-H|-h] [-a|-b] search text ...
#   -F  search for the full character name only
#   -W  no word boundary (default)
#   -w  whole words only
#   -H  display header line (default)
#   -h  omit header line
#   -a  list astral planes (default)
#   -b  list basic multilingual only

# Location of the semicolon-separated UnicodeData.txt file that is searched.
# Set UNICODE_DATA in the environment to use a copy stored elsewhere; when it
# is unset or empty the original per-user default path is used.
UnicodeData="${UNICODE_DATA:-${HOME}/.local/share/UnicodeData.txt}"
# join SEPARATOR [WORD...]
#
# Print the WORDs on one line with SEPARATOR between each adjacent pair.
# With no WORDs an empty line is printed.
#
# The body runs in a subshell (parentheses instead of braces) so that the
# working variables cannot overwrite same-named variables in the caller,
# without relying on the non-POSIX `local` keyword.
join() (
  sep=${1-}
  [ "$#" -eq 0 ] || shift

  # Nothing to join: shifting again would fail (and abort some shells).
  if [ "$#" -eq 0 ]; then
    printf '\n'
    return 0
  fi

  joined=$1
  shift
  for word in "$@"; do
    joined="$joined$sep$word"
  done

  # printf rather than echo: the result may contain backslashes (e.g. a
  # '\b' separator) or be a lone '-n', both of which some echo
  # implementations interpret instead of printing literally.
  printf '%s\n' "$joined"
)
# Let getopt(1) validate the command line. The loop further down relies on
# its output listing the recognised options first, then '--', then the
# search words.
#
# The exit status is tested on the getopt call itself: checking "$?" after
# `eval set -- $(getopt ...)` would only ever see the status of `set`, so an
# invalid option would never reach the usage message.
if ! parsed_args=$(getopt -n "$0" FWwHhab "$@"); then
  echo >&2 "usage: $0 [-F|-W|-w] [-H|-h] [-a|-b] search text ..."
  exit 1
fi
# Quoted so the getopt output reaches eval untouched (no word splitting or
# filename expansion before eval re-parses it).
eval set -- "$parsed_args"

# Defaults
fullname=n    # y: pattern must match an entire ';'-delimited field
bound=''      # regex placed on both sides of every search word
between='.*'  # regex placed between consecutive search words
header=y      # y: print the column header lines
astral=y      # n: restrict matches to four-character code values

# Consume options up to the '--' separator; later options override earlier
# ones of the same group.
while [ "$1" != '--' ]; do
  case "$1" in
    -F) fullname=y; bound='';   between=' '  ;;  # search for full name only
    -W) fullname=n; bound='';   between='.*' ;;  # no word boundary
    -w) fullname=n; bound='\b'; between='.*' ;;  # whole words only
    -H) header=y ;;                              # display header line
    -h) header=n ;;                              # omit header line
    -a) astral=y ;;                              # list astral planes
    -b) astral=n ;;                              # list basic multilingual only
  esac
  shift
done
shift  # drop the '--' itself; "$@" now holds only the search words

# Build the search regex: every word wrapped in $bound, words separated by
# $between.
joiner="$bound$between$bound"
re="${bound}$(join "$joiner" "$@")${bound}"

if [ "$fullname" = "y" ]; then
  # Anchor on the field separators so the words form a complete field.
  re=";$re;"
fi

if [ "$astral" = "n" ]; then
  # Require exactly four characters before the first ';' of the line.
  re="^....(?=;).*$re"
fi
# Search the data file, prefix every match with the character itself and its
# U+ code, keep the first three columns, and page the aligned table.
(
  # '=' (not '=='): the only string-equality operator POSIX test defines.
  if [ "$header" = "y" ]; then
    # Fields of a UnicodeData.txt line:
    #  1 Code value
    #  2 Character name
    #  3 General category
    #      (Normative)                    (Informative)
    #      Lu Letter, Uppercase           Lm Letter, Modifier
    #      Ll Letter, Lowercase           Lo Letter, Other
    #      Lt Letter, Titlecase           Pc Punctuation, Connector
    #      Mn Mark, Non-Spacing           Pd Punctuation, Dash
    #      Mc Mark, Spacing Combining     Ps Punctuation, Open
    #      Me Mark, Enclosing             Pe Punctuation, Close
    #      Nd Number, Decimal Digit       Pi Punctuation, Initial quote
    #      Nl Number, Letter              Pf Punctuation, Final quote
    #      No Number, Other               Po Punctuation, Other
    #      Zs Separator, Space            Sm Symbol, Math
    #      Zl Separator, Line             Sc Symbol, Currency
    #      Zp Separator, Paragraph        Sk Symbol, Modifier
    #      Cc Other, Control              So Symbol, Other
    #      Cf Other, Format
    #      Cs Other, Surrogate
    #      Co Other, Private Use
    #      Cn Other, Not Assigned
    #  4 Canonical combining classes
    #  5 Bidirectional category
    #      L   Left-to-Right              AN  Arabic Number
    #      LRE Left-to-Right Embedding    CS  Common Number Separator
    #      LRO Left-to-Right Override     NSM Non-Spacing Mark
    #      R   Right-to-Left              BN  Boundary Neutral
    #      AR  Right-to-Left Arabic       B   Paragraph Separator
    #      RLE Right-to-Left Embedding    S   Segment Separator
    #      RLO Right-to-Left Override     WS  Whitespace
    #      PDF Pop Directional Format     ON  Other Neutrals
    #      EN  European Number
    #      ES  European Number Separator
    #      ET  European Number Terminator
    #  6 Character decomposition mapping
    #  7 Decimal digit value
    #  8 Digit value
    #  9 Numeric value
    # 10 Mirrored?
    # 11 Unicode 1.0 name
    # 12 ISO 10646 comment
    # 13 Uppercase mapping
    # 14 Lowercase mapping
    # 15 Titlecase mapping
    #
    # The header rows are semicolon-separated like the data so that they pass
    # through the same cut/column stages; their first field is not a hex
    # number, so the perl substitution below leaves them untouched.
    echo 'Ch;Code;Character name;;;;;;;;;;;;;'
    echo '--;----;--------------;;;;;;;;;;;;;'
  fi
  # '--' keeps a pattern that starts with '-' from being read as an option.
  pcregrep -i -- "$re" "$UnicodeData"
) |
  # Replace the leading hex code "XXXX;" with "<character>;U+XXXX;".
  perl -Mutf8 -CS -pe 's/^([0-9A-F]+);/sprintf "%s;U+%s;", chr(hex($1)), $1/ie' |
  cut -d';' -f 1,2,3 |
  column -ts';' |
  less -Fs
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment