Created
May 13, 2012 23:41
-
-
Save mjf/2690801 to your computer and use it in GitHub Desktop.
pinyin2utf8.sed -- Convert US-ASCII Pinyin to UTF-8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/sed -f | |
# pinyin2utf8.sed -- Convert US-ASCII Pinyin to UTF-8 | |
# Copyright (C) 2012 Matous J. Fialka (mjf), <http://mjf.cz/> | |
# Released under the terms of The MIT License | |
# | |
# DESCRIPTION | |
# The script converts all occurrences of US-ASCII encoded Pinyin text
# enclosed in pairs of solidus characters (/.../) to UTF-8 encoded text.
# | |
# USAGE | |
# pinyin2utf8.sed filename [ > filename.out ] | |
# | |
# SYNTAX | |
# The following groups of Pinyin elements can be transliterated: | |
# | |
# ----------------------------------------------------- | |
# ang1 eng1 ing1 ong1 an1 en1 in1 un1 ao1 ou1 ai1 ei1 | |
# ang2 eng2 ing2 ong2 an2 en2 in2 un2 ao2 ou2 ai2 ei2 | |
# ang3 eng3 ing3 ong3 an3 en3 in3 un3 ao3 ou3 ai3 ei3 | |
# ang4 eng4 ing4 ong4 an4 en4 in4 un4 ao4 ou4 ai4 ei4 | |
# | |
#   a1  er2  lyue  nyue  e1  o1  i1  nyu3  lyu  v1  u1  u:1
#   a2  er3              e2  o2  i2             v2  u2  u:2
#   a3  er4              e3  o3  i3             v3  u3  u:3
#   a4                   e4  o4  i4             v4  u4  u:4
#                                               v0      u:0
# ----------------------------------------------------- | |
# | |
# WARNINGS | |
# The script uses the ^A (0x01) control character, usually displayed
# as ^A in most text editors and usually typed with the ^V ^A key
# sequence, as a temporary stand-in for newlines. The ^A control
# character therefore MUST NOT occur in the input stream. To find
# where it is used, look up the y/// commands in the code, please.
# | |
# The conversion code from US-ASCII encoded Pinyin to UTF-8 Pinyin
# uses special delimiting sequences of left and right parentheses
# internally; those delimiting sequences of left or right parens
# SHOULD NOT be used in the input stream.
# | |
# CHANGELOG | |
# v0.1 -- Initial version (-mjf) | |
# v0.2 -- Add support for /v[0-4]/ with meaning /u:[0-4]/ (-mjf) | |
# Overview of the program below:
#   1. Read the whole input into the pattern space.
#   2. Replace every newline by ^A and every solidus by a newline, so
#      that the newline becomes the segment delimiter:
#          text NL pinyin NL text NL pinyin NL text ...
#   3. Repeatedly take the first delimited segment (the second
#      NL-separated field), convert it, and splice it back between two
#      real solidus characters.
#   4. Turn any left-over delimiter back into a solidus and every ^A
#      back into a newline.
#
# The ^A sentinel is written as the GNU sed escape \x01, so the script
# holds no raw control bytes and survives copy & paste.  The input MUST
# NOT contain ^A, and SHOULD NOT contain runs of three parentheses,
# which the conversion code uses for its intermediate placeholders.

# Step 1: append every following line until the last one is reached.
: 0
$! {
N
b 0
}
# Input without any solidus holds nothing to convert: print it and quit.
/\//! q
# HERE BE DRAGONS
# Step 2: park the real newlines as ^A, then make the newline the
# delimiter.  (The second y/// keeps its backslash-newline spelling.)
y/\n/\x01/
y/\//\
/
: a
# Stop as soon as no complete delimiter pair is left.  Without this
# guard an unpaired solidus made the splice below fail to match and the
# next pass reassembled the text in the wrong order ("a/b" -> "b/a/b").
/\n[^\n]*\n/! b z
# Keep the full text in the hold space, then cut the pattern space down
# to the first delimited segment only.
h
s/[^\n]*\n//
s/\n.*//
# CONVERSION CODE BEGINNING
# Phase 1 rewrites "vowel + tone digit" into a placeholder
# (((<vowel><key>))) where the key letters q, w, e, r stand for tones
# 1, 2, 3, 4 and s for the toneless u-umlaut.  The placeholders contain
# no digits, so a syllable that was already handled is never matched
# again by a later rule.
#
# Finals ending in -ng.
s/ang1/(((aq)))ng/g
s/ang2/(((aw)))ng/g
s/ang3/(((ae)))ng/g
s/ang4/(((ar)))ng/g
s/eng1/(((eq)))ng/g
s/eng2/(((ew)))ng/g
s/eng3/(((ee)))ng/g
s/eng4/(((er)))ng/g
s/ing1/(((iq)))ng/g
s/ing2/(((iw)))ng/g
s/ing3/(((ie)))ng/g
s/ing4/(((ir)))ng/g
s/ong1/(((oq)))ng/g
s/ong2/(((ow)))ng/g
s/ong3/(((oe)))ng/g
s/ong4/(((or)))ng/g
# Finals ending in -n.
s/an1/(((aq)))n/g
s/an2/(((aw)))n/g
s/an3/(((ae)))n/g
s/an4/(((ar)))n/g
s/en1/(((eq)))n/g
s/en2/(((ew)))n/g
s/en3/(((ee)))n/g
s/en4/(((er)))n/g
s/in1/(((iq)))n/g
s/in2/(((iw)))n/g
s/in3/(((ie)))n/g
s/in4/(((ir)))n/g
s/un1/(((uq)))n/g
s/un2/(((uw)))n/g
s/un3/(((ue)))n/g
s/un4/(((ur)))n/g
# Diphthongs ao, ou, ai, ei: the mark goes on the first vowel.  These
# rules must run before the single-vowel rules, which would otherwise
# put the mark on the second vowel.
s/ao1/(((aq)))o/g
s/ao2/(((aw)))o/g
s/ao3/(((ae)))o/g
s/ao4/(((ar)))o/g
s/ou1/(((oq)))u/g
s/ou2/(((ow)))u/g
s/ou3/(((oe)))u/g
s/ou4/(((or)))u/g
s/ai1/(((aq)))i/g
s/ai2/(((aw)))i/g
s/ai3/(((ae)))i/g
s/ai4/(((ar)))i/g
s/ei1/(((eq)))i/g
s/ei2/(((ew)))i/g
s/ei3/(((ee)))i/g
s/ei4/(((er)))i/g
# Single a, then the er final.
s/a1/(((aq)))/g
s/a2/(((aw)))/g
s/a3/(((ae)))/g
s/a4/(((ar)))/g
s/er2/(((ew)))r/g
s/er3/(((ee)))r/g
s/er4/(((er)))r/g
# lyue / nyue: the "yu" becomes a toneless u-umlaut placeholder; a tone
# digit after the e is picked up by the e rules just below.
s/lyue/l(((u:)))e/g
s/nyue/n(((u:)))e/g
# Single e, o, i.
s/e1/(((eq)))/g
s/e2/(((ew)))/g
s/e3/(((ee)))/g
s/e4/(((er)))/g
s/o1/(((oq)))/g
s/o2/(((ow)))/g
s/o3/(((oe)))/g
s/o4/(((or)))/g
s/i1/(((iq)))/g
s/i2/(((iw)))/g
s/i3/(((ie)))/g
s/i4/(((ir)))/g
# u-umlaut spellings: nyu3, lyu, v<digit> (alias of u:<digit>), u:<digit>.
# They must run before the plain u rules.
s/nyu3/n(((u:e)))/g
s/lyu/l(((u:)))/g
s/v\([0-4]\)/u:\1/g
s/u:1/(((u:q)))/g
s/u:2/(((u:w)))/g
s/u:3/(((u:e)))/g
s/u:4/(((u:r)))/g
s/u:0/(((u:s)))/g
# Plain u.
s/u1/(((uq)))/g
s/u2/(((uw)))/g
s/u3/(((ue)))/g
s/u4/(((ur)))/g
# Phase 2 replaces every placeholder by its UTF-8 character.
s/(((aq)))/ā/g
s/(((aw)))/á/g
s/(((ae)))/ǎ/g
s/(((ar)))/à/g
s/(((eq)))/ē/g
s/(((ew)))/é/g
s/(((ee)))/ě/g
s/(((er)))/è/g
s/(((iq)))/ī/g
s/(((iw)))/í/g
s/(((ie)))/ǐ/g
s/(((ir)))/ì/g
s/(((oq)))/ō/g
s/(((ow)))/ó/g
s/(((oe)))/ǒ/g
s/(((or)))/ò/g
s/(((uq)))/ū/g
s/(((uw)))/ú/g
s/(((ue)))/ǔ/g
s/(((ur)))/ù/g
s/(((u:q)))/ǖ/g
s/(((u:w)))/ǘ/g
s/(((u:e)))/ǚ/g
s/(((u:r)))/ǜ/g
s/(((u:s)))/ü/g
# The placeholder emitted by the lyu, lyue and nyue rules had no
# replacement and leaked into the output verbatim.
s/(((u:)))/ü/g
# CONVERSION CODE END
# Step 3, splice: after G the pattern space is
#     converted NL text-before NL original-segment NL rest
# and becomes
#     text-before / converted / rest
# The solidus characters are real ones again, so the finished part now
# belongs to the leading text of the next pass.
G
s/\([^\n]*\)\n\([^\n]*\)\n[^\n]*\n/\2\/\1\//
b a
: z
# HERE BE DRAGONS
# Step 4: a delimiter that is still present came from an unpaired
# solidus; give it back unchanged, then restore the parked newlines.
y/\n/\//
y/\x01/\
/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Warning
The script contains binary data. Raw view MUST be used to download it. The copy & paste "method" MUST NOT be used to obtain this script! You have been warned.
Sample input text
Sample run
Another sample input
Another sample run