0x5742 · April 6, 2016 18:43
diff --git a/uni b/uni
 #!/bin/sh
 # Somewhat more convenient than grepping UnicodeData.txt directly.

 # Path of the local UnicodeData.txt copy that the search below reads.
 UnicodeData="${HOME}/.local/share/UnicodeData.txt"

 # join SEP [WORD ...]
 #
 # Print the WORD arguments on a single line with SEP between consecutive
 # words.  With no WORD arguments an empty line is printed.
 #
 # Sets the global variables joiner, output and arg; call it inside $(...)
 # when the caller's own variables of those names must survive.
 join() {
    joiner="$1"; shift
    output=''
    # Only take (and shift away) the first word when there is one; shifting
    # with no positional parameters left is an error.
    if [ "$#" -gt 0 ]; then
        output="$1"; shift
    fi
    for arg in "$@"; do
        output="$output$joiner$arg"
    done
    # printf rather than echo: echo may treat a result such as "-n" as an
    # option, and some implementations expand backslash sequences like '\b'.
    printf '%s\n' "$output"
 }

 # Parse the command line with getopt(1).  getopt's own exit status has to be
 # tested directly: after `eval set -- $(getopt ...)` the value of $? is the
 # status of `set`, so a rejected option would never reach the usage message.
 if ! parsed=$(getopt -n "$0" FWwHhab "$@"); then
    echo >&2 "usage: $0 [-F|-W|-w] [-H|-h] [-a|-b] search text ..."
    exit 1
 fi
 # Quoted so that getopt's output reaches eval unchanged (no word splitting
 # or pathname expansion beforehand).
 eval set -- "$parsed"

 # Defaults
 fullname=n    # y: wrap the regex in ';' so it must span a whole field
 bound=''      # regex fragment placed around every search term
 between='.*'  # regex fragment placed between consecutive search terms
 header=y      # y: print the two header lines
 astral=y      # n: only match lines whose code field is four characters long

 # Consume options up to the '--' separator; the "$#" guard keeps the loop
 # from spinning forever should the separator be missing.
 while [ "$#" -gt 0 ] && [ "$1" != '--' ]; do case "$1" in
    -F) fullname=y; bound='';   between=' '  ;; # search for full name only
    -W) fullname=n; bound='';   between='.*' ;; # no word boundary
    -w) fullname=n; bound='\b'; between='.*' ;; # whole words only
    -H) header=y                             ;; # display header line
    -h) header=n                             ;; # omit header line
    -a) astral=y                             ;; # list astral planes
    -b) astral=n                             ;; # list basic multilingual only
 esac; shift; done
 # Drop the '--' separator itself; what remains are the search terms.
 if [ "$#" -gt 0 ]; then
    shift
 fi

 # Build the regex: every term wrapped in $bound, terms separated by $between.
 joiner="$bound$between$bound"
 re="$bound$(join "$joiner" "$@")$bound"
 if [ "$fullname" = "y" ]; then
    # Anchor on the ';' field separators on both sides.
    re=";$re;"
 fi

 if [ "$astral" = "n" ]; then
    # Require exactly four characters before the first ';'.
    re="^....(?=;).*$re"
 fi

 # Produce the result table: optional header lines plus every UnicodeData.txt
 # line matching $re (case-insensitively).  Each data line is prefixed with
 # the character itself and its U+XXXX code, trimmed to the first three
 # ';'-separated fields, aligned into columns and paged.
 (
    # '=' instead of '==': the script runs under /bin/sh, and '=' is the
    # string comparison used by the other tests in this file.
    if [ "$header" = "y" ]; then
        #  1 Code value
        #  2 Character name
        #  3 General category
        #       (Normative)                         (Informative)
        #       Lu   Letter, Uppercase              Lm   Letter, Modifier
        #       Ll   Letter, Lowercase              Lo   Letter, Other
        #       Lt   Letter, Titlecase              Pc   Punctuation, Connector
        #       Mn   Mark, Non-Spacing              Pd   Punctuation, Dash
        #       Mc   Mark, Spacing Combining        Ps   Punctuation, Open
        #       Me   Mark, Enclosing                Pe   Punctuation, Close
        #       Nd   Number, Decimal Digit          Pi   Punctuation, Initial quote
        #       Nl   Number, Letter                 Pf   Punctuation, Final quote
        #       No   Number, Other                  Po   Punctuation, Other
        #       Zs   Separator, Space               Sm   Symbol, Math
        #       Zl   Separator, Line                Sc   Symbol, Currency
        #       Zp   Separator, Paragraph           Sk   Symbol, Modifier
        #       Cc   Other, Control                 So   Symbol, Other
        #       Cf   Other, Format
        #       Cs   Other, Surrogate
        #       Co   Other, Private Use
        #       Cn   Other, Not Assigned
        #  4 Canonical combining classes
        #  5 Bidirectional category
        #       L    Left-to-Right                  AN   Arabic Number
        #       LRE  Left-to-Right Embedding        CS   Common Number Separator
        #       LRO  Left-to-Right Override         NSM  Non-Spacing Mark
        #       R    Right-to-Left                  BN   Boundary Neutral
        #       AR   Right-to-Left Arabic           B    Paragraph Separator
        #       RLE  Right-to-Left Embedding        S    Segment Separator
        #       RLO  Right-to-Left Override         WS   Whitespace
        #       PDF  Pop Directional Format         ON   Other Neutrals
        #       EN   European Number
        #       ES   European Number Separator
        #       ET   European Number Terminator
        #  6 Character decomposition mapping
        #  7 Decimal digit value
        #  8 Digit value
        #  9 Numeric value
        # 10 Mirrored?
        # 11 Unicode 1.0 name
        # 12 ISO 10646 comment
        # 13 Uppercase mapping
        # 14 Lowercase mapping
        # 15 Titlecase mapping

        # The header lines do not begin with hex digits followed by ';', so
        # the perl substitution below leaves them as they are.
        echo 'Ch;Code;Character name;;;;;;;;;;;;;'
        echo '--;----;--------------;;;;;;;;;;;;;'
    fi
    pcregrep -i "$re" "$UnicodeData"
 ) |
    # Replace the leading hex code with "<character>;U+<code>;".
    # NOTE(review): for U+003B the character is ';' itself, which shifts the
    # fields seen by cut and column - confirm whether that needs handling.
    perl -Mutf8 -CS -pe 's/^([0-9A-F]+);/sprintf "%s;U+%s;", chr(hex($1)), $1/ie' |
    cut -d';' -f 1,2,3 |
    column -ts';' | less -Fs
	#!/bin/sh
	# Somewhat more convenient than grepping UnicodeData.txt directly.

	# Path of the local UnicodeData.txt copy that the search below reads.
	UnicodeData="${HOME}/.local/share/UnicodeData.txt"

	# join SEP [WORD ...]
	#
	# Print the WORD arguments on a single line with SEP between consecutive
	# words.  With no WORD arguments an empty line is printed.
	#
	# Sets the global variables joiner, output and arg; call it inside $(...)
	# when the caller's own variables of those names must survive.
	join() {
	    joiner="$1"; shift
	    output=''
	    # Only take (and shift away) the first word when there is one;
	    # shifting with no positional parameters left is an error.
	    if [ "$#" -gt 0 ]; then
	        output="$1"; shift
	    fi
	    for arg in "$@"; do
	        output="$output$joiner$arg"
	    done
	    # printf rather than echo: echo may treat a result such as "-n" as an
	    # option, and some implementations expand backslash sequences.
	    printf '%s\n' "$output"
	}

	# Parse the command line with getopt(1).  getopt's own exit status has to
	# be tested directly: after `eval set -- $(getopt ...)` the value of $? is
	# the status of `set`, so a rejected option would never reach the usage
	# message.
	if ! parsed=$(getopt -n "$0" FWwHhab "$@"); then
	    # Plain '|' in the message; the backslashes were stray escapes that
	    # would have been printed literally.
	    echo >&2 "usage: $0 [-F|-W|-w] [-H|-h] [-a|-b] search text ..."
	    exit 1
	fi
	# Quoted so that getopt's output reaches eval unchanged (no word splitting
	# or pathname expansion beforehand).
	eval set -- "$parsed"

	# Defaults
	fullname=n    # y: wrap the regex in ';' so it must span a whole field
	bound=''      # regex fragment placed around every search term
	between='.*'  # regex fragment placed between consecutive search terms
	header=y      # y: print the two header lines
	astral=y      # n: only match lines whose code field is four characters long

	# Consume options up to the '--' separator; the "$#" guard keeps the loop
	# from spinning forever should the separator be missing.
	while [ "$#" -gt 0 ] && [ "$1" != '--' ]; do case "$1" in
	    -F) fullname=y; bound='';   between=' '  ;; # search for full name only
	    -W) fullname=n; bound='';   between='.*' ;; # no word boundary
	    -w) fullname=n; bound='\b'; between='.*' ;; # whole words only
	    -H) header=y                             ;; # display header line
	    -h) header=n                             ;; # omit header line
	    -a) astral=y                             ;; # list astral planes
	    -b) astral=n                             ;; # list basic multilingual only
	esac; shift; done
	# Drop the '--' separator itself; what remains are the search terms.
	if [ "$#" -gt 0 ]; then
	    shift
	fi

	# Build the regex: every term wrapped in $bound, terms separated by
	# $between.
	joiner="$bound$between$bound"
	re="$bound$(join "$joiner" "$@")$bound"
	if [ "$fullname" = "y" ]; then
	    # Anchor on the ';' field separators on both sides.
	    re=";$re;"
	fi

	if [ "$astral" = "n" ]; then
	    # Require exactly four characters before the first ';'.
	    re="^....(?=;).*$re"
	fi

	# Produce the result table: optional header lines plus every
	# UnicodeData.txt line matching $re (case-insensitively).  Each data line
	# is prefixed with the character itself and its U+XXXX code, trimmed to the
	# first three ';'-separated fields, aligned into columns and paged.
	(
	    # '=' instead of '==': the script runs under /bin/sh, and '=' is the
	    # string comparison used by the other tests in this file.
	    if [ "$header" = "y" ]; then
	        #  1 Code value
	        #  2 Character name
	        #  3 General category
	        #       (Normative)                         (Informative)
	        #       Lu   Letter, Uppercase              Lm   Letter, Modifier
	        #       Ll   Letter, Lowercase              Lo   Letter, Other
	        #       Lt   Letter, Titlecase              Pc   Punctuation, Connector
	        #       Mn   Mark, Non-Spacing              Pd   Punctuation, Dash
	        #       Mc   Mark, Spacing Combining        Ps   Punctuation, Open
	        #       Me   Mark, Enclosing                Pe   Punctuation, Close
	        #       Nd   Number, Decimal Digit          Pi   Punctuation, Initial quote
	        #       Nl   Number, Letter                 Pf   Punctuation, Final quote
	        #       No   Number, Other                  Po   Punctuation, Other
	        #       Zs   Separator, Space               Sm   Symbol, Math
	        #       Zl   Separator, Line                Sc   Symbol, Currency
	        #       Zp   Separator, Paragraph           Sk   Symbol, Modifier
	        #       Cc   Other, Control                 So   Symbol, Other
	        #       Cf   Other, Format
	        #       Cs   Other, Surrogate
	        #       Co   Other, Private Use
	        #       Cn   Other, Not Assigned
	        #  4 Canonical combining classes
	        #  5 Bidirectional category
	        #       L    Left-to-Right                  AN   Arabic Number
	        #       LRE  Left-to-Right Embedding        CS   Common Number Separator
	        #       LRO  Left-to-Right Override         NSM  Non-Spacing Mark
	        #       R    Right-to-Left                  BN   Boundary Neutral
	        #       AR   Right-to-Left Arabic           B    Paragraph Separator
	        #       RLE  Right-to-Left Embedding        S    Segment Separator
	        #       RLO  Right-to-Left Override         WS   Whitespace
	        #       PDF  Pop Directional Format         ON   Other Neutrals
	        #       EN   European Number
	        #       ES   European Number Separator
	        #       ET   European Number Terminator
	        #  6 Character decomposition mapping
	        #  7 Decimal digit value
	        #  8 Digit value
	        #  9 Numeric value
	        # 10 Mirrored?
	        # 11 Unicode 1.0 name
	        # 12 ISO 10646 comment
	        # 13 Uppercase mapping
	        # 14 Lowercase mapping
	        # 15 Titlecase mapping

	        # The header lines do not begin with hex digits followed by ';',
	        # so the perl substitution below leaves them as they are.
	        echo 'Ch;Code;Character name;;;;;;;;;;;;;'
	        echo '--;----;--------------;;;;;;;;;;;;;'
	    fi
	    pcregrep -i "$re" "$UnicodeData"
	) |
	    # Real pipes here: a backslash-escaped '|' is an ordinary word, not a
	    # pipeline operator.  The perl step replaces the leading hex code
	    # with "<character>;U+<code>;".
	    perl -Mutf8 -CS -pe 's/^([0-9A-F]+);/sprintf "%s;U+%s;", chr(hex($1)), $1/ie' |
	    cut -d';' -f 1,2,3 |
	    column -ts';' | less -Fs
No results found