Created
December 13, 2020 09:25
-
-
Save zyf0330/5874455087bc69708f07ff13d12890a5 to your computer and use it in GitHub Desktop.
转换搜狗拼音txt词库为 Gboard 微软双拼词库
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Converts a Sogou Pinyin text dictionary into a Gboard dictionary keyed by
 * Microsoft double-pinyin (shuangpin) codes.
 *
 * Each line of dic.txt looks like: 'a'ba 阿巴
 * Every initial is first combined with every final to build a syllable table,
 * then each full-pinyin syllable is replaced as a whole.
 * For zero-initial syllables the initial is treated as the empty string.
 */
const fs = require('fs')
// Initials (shengmu): full-pinyin spelling -> double-pinyin key.
// Single-letter initials map to themselves; the two-letter initials zh/ch/sh
// are folded onto one key each, and the empty initial (used for zero-initial
// syllables) is typed as 'o'.
const shengmu = {
  q: 'q',
  w: 'w',
  r: 'r',
  t: 't',
  y: 'y',
  p: 'p',
  s: 's',
  d: 'd',
  f: 'f',
  g: 'g',
  h: 'h',
  j: 'j',
  k: 'k',
  l: 'l',
  z: 'z',
  x: 'x',
  c: 'c',
  b: 'b',
  n: 'n',
  m: 'm',
  zh: 'v',
  ch: 'i',
  sh: 'u',
  '': 'o',
}
// Finals (yunmu): full-pinyin spelling -> double-pinyin key.
// Several finals deliberately share one key (e.g. iang/uang -> 'd',
// iong/ong -> 's', ia/ua -> 'w'); 'ing' is typed with the ';' key.
// NOTE(review): there is no 've' entry, so a syllable spelled like "lve" or
// "nve" in the input would not be found in the generated table -- confirm
// whether the source dictionary ever uses that spelling.
const yunmu = {
  a: 'a',
  o: 'o',
  u: 'u',
  e: 'e',
  i: 'i',
  v: 'y',
  iang: 'd',
  uang: 'd',
  iong: 's',
  iao: 'c',
  ian: 'm',
  ing: ';',
  ang: 'h',
  eng: 'g',
  ong: 's',
  uan: 'r',
  uai: 'y',
  ia: 'w',
  ai: 'l',
  ei: 'z',
  ui: 'v',
  ao: 'k',
  ou: 'b',
  iu: 'q',
  ie: 'x',
  ue: 't',
  er: 'r',
  an: 'j',
  en: 'f',
  in: 'n',
  ua: 'w',
  un: 'p',
  uo: 'o',
}
// Syllable table as two parallel arrays: shuangpins[i] is the double-pinyin
// code for the full-pinyin syllable quanpins[i]. It is the cross product of
// every initial with every final.
const quanpins = []
const shuangpins = []
// Syllables generated so far, so the duplicate check below is a constant-time
// lookup instead of a scan of quanpins on every iteration.
const seenQuanpins = new Set()
for (const [shengmuQuanpin, shengmuShuangpin] of Object.entries(shengmu)) {
  for (const [yunmuQuanpin, yunmuShuangpin] of Object.entries(yunmu)) {
    const quanpin = shengmuQuanpin + yunmuQuanpin
    // Two different initial+final splits must never spell the same syllable;
    // abort if the tables ever allow that.
    if (seenQuanpins.has(quanpin)) {
      throw new Error(`repeat: ${shengmuQuanpin} ${yunmuQuanpin}`)
    }
    seenQuanpins.add(quanpin)
    quanpins.push(quanpin)
    shuangpins.push(shengmuShuangpin + yunmuShuangpin)
  }
}
// Prints the number of generated syllables (both arrays have the same length).
console.log('音节数', quanpins.length, shuangpins.length)
// Read the full-pinyin source dictionary (one "pinyin word" entry per line).
const quanpinDic = fs.readFileSync('dic.txt', 'utf8')
console.log('read from dic.txt')

// Syllable -> code lookup built once from the parallel arrays, so each
// syllable is resolved in constant time instead of scanning quanpins.
const shuangpinByQuanpin = new Map()
quanpins.forEach((quanpin, i) => {
  shuangpinByQuanpin.set(quanpin, shuangpins[i])
})

// Convert every line to "<shuangpin>\t<word>\tzh-CN"; blank lines stay blank.
const shuangpinDic = quanpinDic
  // readFileSync does not strip a UTF-8 byte-order mark; left in place it
  // would be glued to the first syllable and make its lookup fail.
  .replace(/^\uFEFF/, '')
  .split('\n')
  .map((line, lineIndex) => {
    if (line.trim().length === 0) {
      return ''
    }
    // The pinyin and the word are separated by a single space.
    const [wordQuanpin, wordHanzi] = line.split(' ')
    if (wordHanzi === undefined) {
      // No space on the line: report it instead of crashing on undefined.trim().
      console.error(`dic.txt:${lineIndex + 1}`, '格式错误,缺少词语', line)
      process.exit(1)
    }
    const wordShuangpin = wordQuanpin
      // Syllables are written as 'a'ba, so the leading quote yields an empty
      // first piece that is dropped.
      .split("'")
      .filter((s) => s !== '')
      .map((quanpin) => {
        const shuangpin = shuangpinByQuanpin.get(quanpin)
        if (shuangpin === undefined) {
          // Unknown syllable: print it and abort the whole conversion.
          console.error(wordQuanpin, '有不存在的全拼音节', quanpin, 'length:', quanpin.length)
          process.exit(1)
        }
        return shuangpin
      })
      .join('')
    // trim() also removes a trailing '\r' from CRLF input.
    return `${wordShuangpin}\t${wordHanzi.trim()}\tzh-CN`
  })
  .join('\n')

// Write the header line and the converted entries in one call, replacing any
// existing dictionary.txt.
fs.writeFileSync('dictionary.txt', `# Gboard Dictionary version:1\n${shuangpinDic}`, {flag: 'w'})
console.log('output to dictionary.txt')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
转换好的 Gboard 词库