Skip to content

Instantly share code, notes, and snippets.

@shuxiang
Last active September 22, 2023 20:20
Show Gist options
  • Save shuxiang/b228f71bc79820ad51769d46bf332d34 to your computer and use it in GitHub Desktop.
Save shuxiang/b228f71bc79820ad51769d46bf332d34 to your computer and use it in GitHub Desktop.
分析客家话与普通话读唐诗是否押韵与合辙
#coding=utf8
"""
git clone https://github.com/shuxiang/hakka-lang-bobai
git clone https://github.com/hxgdzyuyi/tang_poetry
mysql> source tang_poetry.sql
pip3 install flask
pip3 install xpinyin
pip3 install Flask-SQLAlchemy
pip3 install pymysql
python3 poem_rhythm.py
"""
import warnings
warnings.filterwarnings("ignore")
import sys
import re
import json
import traceback
from flask import Flask
# Flask application object; the script only uses it to carry the database
# configuration and to open a request context around the queries.
app = Flask(__name__)
# NOTE(review): the credentials/host part of this URI reads "[email protected]",
# which looks like an e-mail-obfuscation artifact from the page this file was
# copied from -- restore the real "password@host" portion before running.
app.config['SQLALCHEMY_DATABASE_URI'] = 'mysql+pymysql://user:[email protected]:3306/poem?charset=utf8mb4'
from flask_sqlalchemy import SQLAlchemy, BaseQuery
class DataBase(SQLAlchemy):
    """SQLAlchemy extension that remembers the Flask app it was bound to."""

    def init_app(self, app):
        """Store ``app`` on the extension, then run the stock initialisation.

        The original author notes that this is required, otherwise part of
        SQLAlchemy's functionality does not work properly.
        """
        # Keep the assignment first: it must be in place before the parent
        # class performs its own setup.
        self.app = app
        super().init_app(app)
# Single extension instance shared by every model in this script.
db = DataBase()


class Model(db.Model):
    """Abstract declarative base carrying the MySQL table defaults."""

    # Abstract: no table is created for this class itself.
    __abstract__ = True
    __table_args__ = {
        'mysql_engine': 'InnoDB',
        'mysql_charset': 'utf8mb4',
    }
    # Not read anywhere in the visible script.
    __dump_prop__ = []
    query_class = BaseQuery


# Swap the extension's base class for the customised one, then bind the
# extension to the Flask application.
db.Model = Model
db.init_app(app)
# Tang poetry data: https://github.com/hxgdzyuyi/tang_poetry
class Poetry(db.Model):
    """ORM mapping for one row of the ``poetries`` table."""

    __tablename__ = 'poetries'

    # Surrogate primary key.
    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    # Integer reference to the author; no foreign key is declared here.
    poet_id = db.Column(db.Integer)
    # Full text of the poem (the scan below matches and splits this field).
    content = db.Column(db.Text)
    title = db.Column(db.String(255))
from functools import partial
# https://github.com/lxneng/xpinyin
from xpinyin import Pinyin
# Romanised initial -> IPA initial. The parser below only uses the KEYS of
# this table, to recognise which prefixes are initials.
轉換聲母 = {
    'm': 'm', 'n': 'n', 'ng': 'ŋ', 'nj': 'ɲ',
    'b': 'p', 'd': 't', 'g': 'k',
    'p': 'pʰ', 't': 'tʰ', 'k': 'kʰ',
    's': 's', 'sh': 'ʃ', 'x': 'ɕ',
    'z': 'ts', 'zh': 'tʃ',
    'c': 'tsʰ', 'ch': 'tʃʰ',
    'y': 'i',
    'j': 'j',
    'f': 'f', 'h': 'h', 'v': 'v',
    'l': 'l', '': ''
}


def _拼音转讀音(yin, 轉換聲母):
    """Split one romanised syllable into initial, rime and tone.

    Args:
        yin: Romanised syllable, optionally ending in a tone digit
            (e.g. ``'ngin2'``). Tone ``'1'`` is assumed when the digit is
            missing.
        轉換聲母: Mapping whose keys are the recognised initials.

    Returns:
        dict with the keys ``'聲母'`` (initial, ``''`` when there is none),
        ``'韻母'`` (rime) and ``'聲調'`` (tone digit, as a string).

    Raises:
        IndexError: if ``yin`` is empty.
    """
    if not yin:
        # Same exception type the unchecked ``yin[-1]`` used to raise, but
        # with a message that says what went wrong.
        raise IndexError('empty romanisation: nothing to parse')
    if not yin[-1].isdigit():
        yin = yin + '1'

    # --- initial -----------------------------------------------------------
    head2 = yin[:2]
    if head2 in 轉換聲母:
        # Two-letter initial.
        if len(yin) == 3 and head2 == 'ng':
            # Bare "ng" + tone: a syllabic nasal, i.e. no initial at all.
            聲母 = ''
        elif yin[:3] == 'ngi':
            # "ng" before "i" is recorded as the palatal nasal "nj".
            聲母 = 'nj'
        else:
            聲母 = head2
    elif yin[0] in 轉換聲母:
        # One-letter initial.
        if len(yin) == 2 and yin[0] in ('m', 'n'):
            # Bare "m"/"n" + tone: syllabic nasal, no initial.
            聲母 = ''
        else:
            聲母 = yin[0]
    else:
        聲母 = ''

    # --- rime --------------------------------------------------------------
    if not 聲母:
        韻母 = yin[:-1]
    elif (聲母 == 'nj' and len(yin) > 4 and yin[:3] == 'ngi'
            and yin[:4] not in ('ngin', 'ngit', 'ngik', 'ngip', 'ngim')):
        # After "ngi" the "i" is dropped from the rime, except in the
        # listed syllables, where it stays part of the rime.
        韻母 = yin[3:-1]
    else:
        # "nj" and the spelled "ng" are both two characters long, so slicing
        # by len(聲母) also covers the "ngi…" spellings kept above.
        韻母 = yin[len(聲母):-1]

    return {'聲母': 聲母, '韻母': 韻母, '聲調': yin[-1]}


# Convenience wrapper bound to the module-level initials table.
拼音转讀音 = partial(_拼音转讀音, 轉換聲母=轉換聲母)
# Mandarin romaniser (xpinyin).
py = Pinyin()
# Mandarin data: https://zhuanlan.zhihu.com/p/260660949
# Vowel letters; used when deciding whether a leading i/u/ü is a medial glide.
元音 = ['a', 'o', 'e', 'ê', 'i', 'u', 'ü']
# Hakka (Bobai) data: https://github.com/shuxiang/hakka-lang-bobai
# Read through a context manager so the handle is closed, and decode as UTF-8
# explicitly (the JSON interchange encoding) instead of the platform default.
with open('../hakka-lang-bobai/hakka-lang-bobai.json', 'r', encoding='utf-8') as hakka_file:
    hakka = json.load(hakka_file)
# character -> romanised reading(s), keyed by simplified form ...
hakka_dict = {zi['zh-CN']: zi['ptk'] for zi in hakka if zi['zh-CN']}
# ... and by traditional form as well.
hakka_dict.update({zi['zh-hant']: zi['ptk'] for zi in hakka if zi['zh-hant']})
# Poems that passed the regulated-verse shape filter.
rhythm_nums = 0
# Mandarin: poems whose rhyme characters share a rime / whose couplets have
# opposing tone patterns.
pth_nums = pth_tones_nums = 0
# Hakka: same two tallies.
hakka_nums = hakka_tones_nums = 0
# Poems that qualify in both readings (rhyme / tone pattern).
both_nums = both_tones_nums = 0
# Decide whether the two lines of a couplet have opposing tone patterns.
def is_sentence_match(tones, tones2, strict=True):
    """Return True when two lines alternate and oppose each other in tone class.

    A tone number ``<= 2`` counts as level (平), anything greater as oblique
    (仄). The lists hold the tones at the key positions of each line: two
    entries (e.g. 平平仄仄平 against 仄仄平平仄) or three entries (e.g.
    平平仄仄平平仄 against 仄仄平平仄仄平).

    Args:
        tones: Key-position tones of the first line.
        tones2: Key-position tones of the second line.
        strict: When True every key position is checked (two, or three if
            ``tones`` has more than two entries). When False the last of
            those positions is left out of the comparison.

    Returns:
        True if the checked positions alternate between level and oblique
        within the first line and the second line has the opposite class at
        each of them; False otherwise, including when either list is too
        short to be checked.
    """
    checked = 2 if len(tones) <= 2 else 3
    if not strict:
        checked -= 1

    first = [tone <= 2 for tone in tones[:checked]]
    second = [tone <= 2 for tone in tones2[:checked]]
    if len(first) < checked or len(second) < checked:
        # Not enough positions to compare: report "no match" rather than
        # failing on an index lookup.
        return False

    # Neighbouring key positions inside a line must differ in class ...
    alternates = all(a != b for a, b in zip(first, first[1:]))
    # ... and the second line must mirror the first at every checked position.
    # (The relaxed two-position test used to ignore the second line in one
    # of its two cases; both directions are now symmetric.)
    opposed = all(a != b for a, b in zip(first, second))
    return alternates and opposed
# Shapes accepted as regulated verse: 2 or 4 couplets of 5- or 7-character
# lines, each couplet written "line,line。".
# NOTE(review): these patterns (and the split further down) use an ASCII
# comma; confirm the stored poems are not punctuated with the full-width
# comma instead, otherwise nothing will match.
_regulated_patterns = [
    re.compile(r'^([\u4e00-\u9fa5]{5}\,[\u4e00-\u9fa5]{5}\。){2}$'),
    re.compile(r'^([\u4e00-\u9fa5]{5}\,[\u4e00-\u9fa5]{5}\。){4}$'),
    re.compile(r'^([\u4e00-\u9fa5]{7}\,[\u4e00-\u9fa5]{7}\。){2}$'),
    re.compile(r'^([\u4e00-\u9fa5]{7}\,[\u4e00-\u9fa5]{7}\。){4}$'),
]

# Walk the whole poetries table in pages of 100 rows and update the module
# level counters for rhyme and tone-pattern matches.
with app.test_request_context():
    i = 0
    while True:
        # ORDER BY gives OFFSET/LIMIT paging a stable row order, so no poem
        # is skipped or visited twice between pages.
        p_list = Poetry.query.order_by(Poetry.id).offset(i*100).limit(100).all()
        if not p_list:
            break
        i += 1
        for poem in p_list:
            # A NULL content column must not abort the scan.
            content = poem.content or ''
            # Only regulated verse is analysed.
            if not any(pattern.match(content) for pattern in _regulated_patterns):
                continue
            try:
                rhythm_nums += 1
                sentence = content.split('。')
                # Number of pieces after splitting on "。" (the trailing
                # empty piece included): 3 for a quatrain, 5 for an
                # eight-line regulated verse.
                length = len(sentence)
                # Rhyme characters: the last character of every couplet.
                if length == 3:
                    zi_list = [sentence[0][-1], sentence[1][-1]]
                elif length == 5:
                    zi_list = [couplet[-1] for couplet in sentence[:4]]
                else:
                    continue

                # ---- Mandarin rimes of the rhyme characters ----------------
                yunmu_list = []
                for zi in zi_list:
                    try:
                        shengmu = py.get_initial(zi, with_retroflex=True).lower()
                        yunmu = py.get_pinyin(zi, tone_marks=None)
                        # Strip the initial from the FRONT only: replacing
                        # every occurrence turned e.g. "nan" into "a" and
                        # "gong" into "on". a/o/e are never initials, so a
                        # leading vowel (should get_initial hand one back for
                        # a zero-initial syllable) is left in the rime.
                        if shengmu not in ('a', 'o', 'e') and yunmu.startswith(shengmu):
                            yunmu = yunmu[len(shengmu):]
                        # Normalise the contracted spellings iu / ui / v.
                        if yunmu == 'iu':
                            yunmu = 'ou'
                        elif yunmu == 'ui':
                            yunmu = 'ei'
                        elif yunmu == 'v':
                            yunmu = 'ü'
                        # Drop a medial glide in front of the main vowel.
                        if len(yunmu) > 1 and yunmu[0] in ('i', 'u', 'ü') and yunmu[1] in 元音:
                            yunmu = yunmu[1:]
                        yunmu_list.append(yunmu)
                    except Exception:
                        # Best effort: an unreadable character just leaves the
                        # list short, which fails the length check below.
                        pass
                print('\npth', yunmu_list, zi_list)
                is_match_pth = False
                # Rhymes when every rhyme character produced the same rime.
                if len(set(yunmu_list)) == 1 and len(yunmu_list) == len(zi_list):
                    pth_nums += 1
                    print('match pth==>', yunmu_list[0])
                    is_match_pth = True

                # ---- Hakka rimes of the rhyme characters -------------------
                yunmu_list2 = []
                for zi in zi_list:
                    try:
                        # Several readings are separated by "/"; use the last.
                        reading = hakka_dict.get(zi, '').split('/')[-1]
                        if reading:
                            yunmu = 拼音转讀音(reading)['韻母']
                            # Drop a medial glide in front of the main vowel.
                            if len(yunmu) > 1 and yunmu[0] in ('i', 'u') and yunmu[1] in 元音:
                                yunmu = yunmu[1:]
                            yunmu_list2.append(yunmu)
                    except Exception:
                        pass
                print('hakka', yunmu_list2, zi_list)
                is_match_hakka = False
                if len(set(yunmu_list2)) == 1 and len(yunmu_list2) == len(zi_list):
                    hakka_nums += 1
                    print('match hakka==>', yunmu_list2[0])
                    is_match_hakka = True

                # Rhymes in both readings.
                if is_match_hakka and is_match_pth:
                    both_nums += 1

                # ---- tone numbers of every line ----------------------------
                pth_tones_list = []
                hakka_tones_list = []
                try:
                    sentence_list = []
                    for couplet in sentence[:-1]:
                        sentence_list += couplet.split(',')
                    for short in sentence_list:
                        # e.g. 'shang4-hai3' -> [4, 3]
                        pth_tones = [
                            int(syllable[-1])
                            for syllable in py.get_pinyin(short, tone_marks='numbers').split('-')
                            if syllable[-1].isdigit()
                        ]
                        # A character without a Hakka reading raises here and
                        # ends tone collection for this poem.
                        hakka_tones = [
                            int(拼音转讀音(hakka_dict.get(zi, '').split('/')[-1])['聲調'])
                            for zi in short
                        ]
                        # Keep only lines that are complete and comparable.
                        if len(pth_tones) == len(hakka_tones) and len(pth_tones) in (5, 7):
                            pth_tones_list.append(pth_tones)
                            hakka_tones_list.append(hakka_tones)
                except Exception:
                    pass

                # ---- couplet by couplet tone-pattern check -----------------
                is_match_pth_tone = []
                is_match_hakka_tone = []
                if (length == 3 and len(pth_tones_list) == 4) or (length == 5 and len(pth_tones_list) == 8):
                    for m in range(0, len(pth_tones_list), 2):
                        d, d2 = pth_tones_list[m], pth_tones_list[m+1]
                        t, t2 = hakka_tones_list[m], hakka_tones_list[m+1]
                        # Only characters 2, 4 (and 6) are binding. Choose the
                        # positions from the LINE length; keying this on the
                        # sentence count made five-character quatrains fail
                        # on index 5 and left position 6 of seven-character
                        # eight-line poems unchecked.
                        picks = (1, 3) if len(d) == 5 else (1, 3, 5)
                        is_match_pth_tone.append(
                            is_sentence_match([d[k] for k in picks], [d2[k] for k in picks]))
                        is_match_hakka_tone.append(
                            is_sentence_match([t[k] for k in picks], [t2[k] for k in picks]))

                # A poem qualifies when it has results and all of them are True.
                pth_tones_ok = set(is_match_pth_tone) == {True}
                hakka_tones_ok = set(is_match_hakka_tone) == {True}
                if pth_tones_ok:
                    pth_tones_nums += 1
                if hakka_tones_ok:
                    hakka_tones_nums += 1
                if pth_tones_ok and hakka_tones_ok:
                    both_tones_nums += 1
            except Exception:
                # Best effort per poem: skip anything that cannot be analysed,
                # but let KeyboardInterrupt / SystemExit stop the scan.
                pass
# Final report. Labels, in order: poems analysed; Mandarin rhyme / tone
# pattern matches; Hakka rhyme / tone pattern matches; matches in both.
print('诗总数: ', rhythm_nums)
print('普通话押韵数:', pth_nums)
print('普通话合辙数:', pth_tones_nums)
print('客家话押韵数:', hakka_nums)
print('客家话合辙数:', hakka_tones_nums)
# both_nums was tallied during the scan but never reported.
print('都押韵数:', both_nums)
print('都平仄合辙数:', both_tones_nums)
# # strict
# 诗总数: 28850
# 普通话押韵数: 12901
# 普通话合辙数: 5196
# 客家话押韵数: 15314
# 客家话合辙数: 8278
# 都平仄合辙数: 2591
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment