Last active
May 6, 2022 11:56
-
-
Save h20y6m/6449d1d5d29a71620d19f29f881a0549 to your computer and use it in GitHub Desktop.
l3str-convert with (u)pLaTeX
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%#!platex -no-guess-input-enc -kanji=utf8 | |
% -*- coding: utf-8 -*- | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
% l3names | |
% | |
\ExplSyntaxOn | |
% \tex_toucs:D is already part of expl3 as of <2022-04-10>, so the definition below stays commented out.
%\tex_global:D \tex_let:D \tex_toucs:D \toucs | |
\ExplSyntaxOff | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
% l3str-convert | |
% | |
\ExplSyntaxOn | |
% Extension hooks for the UTF-8 encoder: a predicate taking a code point,
% and the function that produces the output when the predicate is true.
% The defaults defined here never match and produce nothing; the ptex+utf8
% encoder replaces both locally with the kanji versions.
\cs_new:Npn \__str_encode_if_extended_char_p:n #1 { \c_false_bool } | |
\cs_new:Npn \__str_encode_extended_char:n #1 { } | |
% Globally redefines \__str_encode_utf_viii_char:n (presumably overriding the
% stock l3str-convert definition -- confirm against the installed kernel).
% #1 is a code point. If the hook predicate accepts it, only the hook output
% is produced; otherwise #1 is handed to \__str_encode_utf_viii_loop:wwnnw.
\cs_gset:Npn \__str_encode_utf_viii_char:n #1 | |
{ | |
% Ask the hook first.
\if_predicate:w \__str_encode_if_extended_char_p:n {#1} | |
\__str_encode_extended_char:n {#1} | |
\else: | |
% Regular path. The brace pairs are arguments for the loop helper; their
% second members (0, 192, 224, 240) match the UTF-8 lead-byte prefixes for
% 1- to 4-byte sequences. The exact role of the first members is defined by
% the helper, which is not visible here.
\__str_encode_utf_viii_loop:wwnnw #1 ; - 1 + 0 * ; | |
{ 128 } { 0 } | |
{ 32 } { 192 } | |
{ 16 } { 224 } | |
{ 8 } { 240 } | |
\s__str_stop | |
\fi: | |
} | |
% Extension hooks for the UTF-8 decoder: a predicate on one input token, and
% the function whose output is emitted for that token when the predicate is
% true. The defaults defined here never match and produce nothing; the
% ptex+utf8 decoder replaces both locally with the kanji versions.
\cs_new:Npn \__str_decode_if_extended_char_p:N #1 { \c_false_bool } | |
\cs_new:Npn \__str_decode_extended_char:N #1 { } | |
% Globally redefines \__str_decode_utf_viii_start:N (presumably overriding the
% stock l3str-convert definition -- confirm against the installed kernel).
% #1 is one input token; it is always re-emitted first. Then:
%   - hook predicate true: \s__str followed by the hook output for #1;
%   - `#1 < "C0: \s__str followed by `#1 itself when below "80, otherwise the
%     str_extra and str_error flags are raised and the replacement character
%     code is emitted;
%   - otherwise: `#1 - "C0 is passed to \__str_decode_utf_viii_continuation:wwN.
% The body ends with \__str_decode_utf_viii_start:N, so processing continues
% with the following token (how the loop terminates is not visible here).
\cs_gset:Npn \__str_decode_utf_viii_start:N #1 | |
{ | |
#1 | |
% Extended (e.g. Japanese) character: bypass byte-level UTF-8 decoding.
\if_predicate:w \__str_decode_if_extended_char_p:N #1 | |
\s__str | |
\__str_decode_extended_char:N #1 | |
\else: | |
% Below "C0 the token cannot start a multi-byte sequence.
\if_int_compare:w `#1 < "C0 \exp_stop_f: | |
\s__str | |
\if_int_compare:w `#1 < "80 \exp_stop_f: | |
\int_value:w `#1 | |
\else: | |
% "80--"BF without a preceding lead byte: flag it and substitute.
\flag_raise:n { str_extra } | |
\flag_raise:n { str_error } | |
\int_use:N \c__str_replacement_char_int | |
\fi: | |
\else: | |
% Lead byte: the trailing \exp_after:wN lets expansion continue past the
% closing \fi: while the number is still being read.
\exp_after:wN \__str_decode_utf_viii_continuation:wwN | |
\int_value:w \int_eval:n { `#1 - "C0 } \exp_after:wN | |
\fi: | |
\exp_after:wN % ??? (author's marker) NOTE(review): appears to continue the \exp_after:wN chain across the outer \fi: added by this patch; confirm it is required.
\fi: | |
\s__str | |
\__str_use_none_delimit_by_s_stop:w {"80} {"800} {"10000} {"110000} \s__str_stop | |
\__str_decode_utf_viii_start:N | |
} | |
\ExplSyntaxOff | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
% l3ptex | |
% | |
\ExplSyntaxOn | |
% Scratch macro: declared once with an empty body, then given a local,
% task-specific definition inside a group where it is needed.
\cs_new_protected:Npn \__ptex_tmp:w { }
% (upTeX only) Create one implicit character token per Japanese kcatcode
% (16 kanji, 17 kana, 18 symbol, 19 hangul). Each is \let to a character
% whose code is \jis"2121 and whose category is the requested kcatcode.
\sys_if_engine_uptex:T
  {
    \group_begin:
      % #1 = token to create, #2 = kcatcode to give it.
      % \Ucharcat is expanded once before \cs_new_eq:NN sees its second
      % argument; both numbers are terminated by an explicit space.
      \cs_set:Npn \__ptex_tmp:w #1#2
        {
          \exp_after:wN \cs_new_eq:NN \exp_after:wN #1
            \tex_Ucharcat:D \int_eval:n { \tex_jis:D "2121 } ~ #2 ~
        }
      \__ptex_tmp:w \c_kcatcode_kanji_token  { 16 }
      \__ptex_tmp:w \c_kcatcode_kana_token   { 17 }
      \__ptex_tmp:w \c_kcatcode_symbol_token { 18 }
      \__ptex_tmp:w \c_kcatcode_hangul_token { 19 }
    \group_end:
  }
% (upTeX only) Expands to the kcatcode of token #1 as digits: 16, 17, 18 or 19
% when #1 has the same category as the matching \c_kcatcode_..._token, and 15
% when it matches none of them. \exp_not:N keeps #1 from being expanded by the
% category test.
\sys_if_engine_uptex:T
  {
    \cs_new:Npn \__ptex_char_kcatcode:N #1
      {
        \if_catcode:w \exp_not:N #1 \c_kcatcode_kanji_token  16 \else:
        \if_catcode:w \exp_not:N #1 \c_kcatcode_kana_token   17 \else:
        \if_catcode:w \exp_not:N #1 \c_kcatcode_symbol_token 18 \else:
        \if_catcode:w \exp_not:N #1 \c_kcatcode_hangul_token 19 \else:
          15
        \fi: \fi: \fi: \fi:
      }
  }
% Definition of the ptex+utf8 encoding: encode direction.
% Runs the utf8 encoder inside a group in which the two encoder hooks are
% pointed at the kanji predicate and the kanji generator, so the hooks fall
% back to their previous meaning once the group ends.
\cs_new_protected:cpn { __str_convert_encode_ptex+utf8: }
  {
    \group_begin:
      \cs_set_eq:NN \__str_encode_extended_char:n
        \__str_encode_kanji_char:n
      \cs_set_eq:NN \__str_encode_if_extended_char_p:n
        \__str_encode_if_kanji_char_p:n
      \cs:w __str_convert_encode_utf8: \cs_end:
    \group_end:
  }
% Predicate: is the Unicode code point #1 a Japanese character?
% Two engine-specific definitions are provided; \jis"2121 = "3000 selects the
% one for upTeX running with Unicode as its internal encoding.
\int_compare:nNnTF { \tex_jis:D "2121 } = { "3000 }
  {
    % upTeX with Unicode internal encoding.
    \prg_new_conditional:Npnn \__str_encode_if_kanji_char:n #1 { p }
      {
        % \kcatcode for 0--127 is not guaranteed to be 15, but ASCII is
        % never a Japanese character, so it is rejected up front.
        \int_compare:nNnTF {#1} < { "80 }
          { \prg_return_false: }
          {
            \int_compare:nNnTF { \tex_kcatcode:D #1 } = { 15 }
              { \prg_return_false: }
              { \prg_return_true: }
          }
      }
  }
  {
    % pTeX, or upTeX with EUC/SJIS internal encoding.
    \prg_new_conditional:Npnn \__str_encode_if_kanji_char:n #1 { p }
      {
        % \ucs yields -1 when the code point has no internal Kanji code.
        \int_compare:nNnTF { \tex_ucs:D #1 } < { 0 }
          { \prg_return_false: }
          {
            % The kcatcode-15 case can only occur on upTeX; on pTeX this
            % comparison is always false.
            \int_compare:nNnTF { \tex_kcatcode:D \tex_ucs:D #1 } = { 15 }
              { \prg_return_false: }
              { \prg_return_true: }
          }
      }
  }
% Generate the Japanese character token for Unicode code point #1.
% Two engine-specific definitions are provided; \jis"2121 = "3000 selects the
% one for upTeX running with Unicode as its internal encoding.
\int_compare:nNnTF { \tex_jis:D "2121 } = { "3000 }
  {
    % upTeX with Unicode internal encoding.
    \cs_new:Npn \__str_encode_kanji_char:n #1
      {
        % \Uchar always generates a Latin character token for character
        % codes 128--255, but a Japanese character token is needed, so
        % \Ucharcat is used instead. The token gets the kcatcode currently
        % assigned to #1, or 18 when that kcatcode is not above 15.
        \if_int_compare:w \tex_kcatcode:D #1 > 15 \exp_stop_f:
          \tex_Ucharcat:D #1 \exp_stop_f: \tex_kcatcode:D #1 \exp_stop_f:
        \else:
          \tex_Ucharcat:D #1 \exp_stop_f: 18 \exp_stop_f:
        \fi:
      }
  }
  {
    % pTeX, or upTeX with EUC/SJIS internal encoding.
    % #1 is first converted to the internal code with \ucs. Wrapping the
    % conversion in \int_eval:n makes \Uchar see plain digits followed by
    % exactly one terminator, whether \ucs is an expandable primitive or a
    % read-only integer. Writing "\ucs #1" with two \exp_stop_f: directly
    % would leave the second \exp_stop_f: in the output in the read-only case
    % (reportedly pTeX 4.1.0 and later -- confirm against the pTeX manual).
    \cs_new:Npn \__str_encode_kanji_char:n #1
      { \tex_Uchar:D \int_eval:n { \tex_ucs:D #1 } \exp_stop_f: }
  }
% Definition of the ptex+utf8 encoding: decode direction.
% Runs the utf8 decoder inside a group in which the two decoder hooks are
% pointed at the kanji predicate and the kanji-to-code-point converter, so
% the hooks fall back to their previous meaning once the group ends.
\cs_new_protected:cpn { __str_convert_decode_ptex+utf8: }
  {
    \group_begin:
      \cs_set_eq:NN \__str_decode_extended_char:N
        \__str_decode_kanji_char:N
      \cs_set_eq:NN \__str_decode_if_extended_char_p:N
        \__str_decode_if_kanji_char_p:N
      \cs:w __str_convert_decode_utf8: \cs_end:
    \group_end:
  }
% Predicate: is the token #1 a Japanese character?
% Two engine-specific definitions are provided; \jis"2121 = "3000 selects the
% one for upTeX running with Unicode as its internal encoding.
\int_compare:nNnTF { \tex_jis:D "2121 } = { "3000 }
  {
    % upTeX with Unicode internal encoding: decide by the token's kcatcode
    % (anything above 15 counts as Japanese).
    \prg_new_conditional:Npnn \__str_decode_if_kanji_char:N #1 { p }
      {
        \int_compare:nNnTF { \__ptex_char_kcatcode:N #1 } > { 15 }
          { \prg_return_true: }
          { \prg_return_false: }
      }
  }
  {
    % pTeX, or upTeX with EUC/SJIS internal encoding: decide by the
    % character code (codes below "100 are not Japanese).
    \prg_new_conditional:Npnn \__str_decode_if_kanji_char:N #1 { p }
      {
        \int_compare:nNnTF { `#1 } < { "100 }
          { \prg_return_false: }
          { \prg_return_true: }
      }
  }
% Convert the Japanese character token #1 to its Unicode code point.
% The decoder emits the result of this function right after \s__str, so it
% has to expand to the code point's digits.
% Two engine-specific definitions are provided; \jis"2121 = "3000 selects the
% one for upTeX running with Unicode as its internal encoding.
\int_compare:nNnTF { \tex_jis:D "2121 } = { "3000 }
  {
    % upTeX with Unicode internal encoding: the character code already is
    % the Unicode code point.
    \cs_new:Npn \__str_decode_kanji_char:N #1
      { \int_value:w `#1 }
  }
  {
    % pTeX, or upTeX with EUC/SJIS internal encoding: convert the internal
    % code with \toucs. The leading \int_value:w forces digits to be
    % produced whether \toucs is an expandable primitive or a read-only
    % integer. A bare "\toucs `#1" would stay unexpanded in the output in the
    % read-only case (reportedly pTeX 4.1.0 and later -- confirm against the
    % pTeX manual).
    \cs_new:Npn \__str_decode_kanji_char:N #1
      { \int_value:w \tex_toucs:D `#1 }
  }
% Register "ptex" as an alternative (alias) name for the ptex+utf8 encoding.
\prop_gput:Nnn \g__str_alias_prop { ptex } { ptex+utf8 } | |
% Make the "default" encoding name resolve to ptex+utf8 as well.
\prop_gput:Nnn \g__str_alias_prop { default } { ptex+utf8 } | |
\ExplSyntaxOff | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
% Round-trip checks, run while the file is read. Each result is written to
% the terminal with \iow_term:x; the "% =>" lines record the output
% (presumably as observed by the author).
\ExplSyntaxOn | |
% Test 1a: mixed ASCII / Latin-1 / Greek / kana / kanji / symbols,
% default (ptex+utf8) -> utf16/hex.
\tl_set:Nn \l_tmpa_tl {ABC~äëïöü~αβγ~あいうえお~日本語~☀☁☂☃} | |
\exp_args:NNV \str_set_convert:Nnnn \l_tmpa_str \l_tmpa_tl { default } { utf16/hex } | |
\iow_term:x { \l_tmpa_str } | |
% => FEFF004100420043002000E400EB00EF00F600FC002003B103B203B300203042304430463048304A002065E5672C8A9E00202600260126022603 | |
% Test 1b: the same data in the opposite direction, utf16/hex -> default.
\tl_set:Nn \l_tmpa_tl { FEFF004100420043002000E400EB00EF00F600FC002003B103B203B300203042304430463048304A002065E5672C8A9E00202600260126022603 } | |
\exp_args:NNV \str_set_convert:Nnnn \l_tmpa_str \l_tmpa_tl { utf16/hex } { default } | |
\iow_term:x { \l_tmpa_str } | |
% Two results are recorded. NOTE(review): presumably the first is from an
% engine that can represent the symbols as Japanese characters (upTeX) and the
% second from one that cannot (pTeX), leaving them as UTF-8 bytes -- confirm.
% => ABC ^^c3^^a4^^c3^^ab^^c3^^af^^c3^^b6^^c3^^bc αβγ あいうえお 日本語 ☀☁☂☃ | |
% => ABC ^^c3^^a4^^c3^^ab^^c3^^af^^c3^^b6^^c3^^bc αβγ あいうえお 日本語 ^^e2^^98^^80^^e2^^98^^81^^e2^^98^^82^^e2^^98^^83 | |
% Test 2a: characters from the U+00A7--U+00F7 range, default -> utf16/hex.
\tl_set:Nn \l_tmpa_tl {§¨°±´¶×÷} | |
\exp_args:NNV \str_set_convert:Nnnn \l_tmpa_str \l_tmpa_tl { default } { utf16/hex } | |
\iow_term:x { \l_tmpa_str } | |
% => FEFF00A700A800B000B100B400B600D700F7 | |
% Test 2b: the same data in the opposite direction, utf16/hex -> default.
\tl_set:Nn \l_tmpa_tl { FEFF00A700A800B000B100B400B600D700F7 } | |
\exp_args:NNV \str_set_convert:Nnnn \l_tmpa_str \l_tmpa_tl { utf16/hex } { default } | |
\iow_term:x { \l_tmpa_str } | |
% => §¨°±´¶×÷ | |
\ExplSyntaxOff | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
% Declare the document metadata with the dvipdfmx backend, using whichever
% interface the running LaTeX format offers.
\ifdefined\DocumentMetadata
  % LaTeX2e 2022-06-01: \DocumentMetadata is available directly.
  \DocumentMetadata{backend=dvipdfmx}
\else
  % LaTeX2e 2021-11-15: load the testphase package and use its command.
  \RequirePackage{pdfmanagement-testphase}
  \DeclareDocumentMetadata{backend=dvipdfmx}
\fi
\ExplSyntaxOn | |
% pdfmanagement-testphase resets the "default" encoding alias to utf8
% (the original author did not know why).
% Point "default" back at ptex+utf8 after the metadata declaration.
\prop_gput:Nnn \g__str_alias_prop { default } { ptex+utf8 } | |
\ExplSyntaxOff | |
% Pick the document class that matches the engine's internal encoding:
% \jis"2121 equals "3000 only when the internal encoding is Unicode.
\ifnum\jis"2121="3000 | |
% upLaTeX with Unicode internal encoding | |
\documentclass[dvipdfmx]{ujarticle} | |
\else | |
% pLaTeX, or upLaTeX with EUC/SJIS internal encoding | |
\documentclass[dvipdfmx]{jarticle} | |
\fi | |
% Minimal test document. The PDF title and the section titles contain
% non-ASCII text -- presumably so that hyperref's PDF strings exercise the
% ptex+utf8 default conversion defined earlier in this file; confirm.
\usepackage{hyperref} | |
\usepackage{pxjahyper} | |
\hypersetup{pdftitle = {ABC äëïöü αβγ あいうえお 日本語 ☀☁☂☃}} | |
\begin{document} | |
\section{はじめに} | |
あいうえお。 | |
\section{つぎに} | |
かきくけこ。 | |
\section{さいごに} | |
わをん。 | |
\end{document} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment