Last active
September 22, 2023 20:20
-
-
Save shuxiang/b228f71bc79820ad51769d46bf332d34 to your computer and use it in GitHub Desktop.
分析客家话与普通话读唐诗是否押韵与合辙
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf8 | |
""" | |
git clone https://github.com/shuxiang/hakka-lang-bobai | |
git clone https://github.com/hxgdzyuyi/tang_poetry | |
mysql> source tang_poetry.sql | |
pip3 install flask | |
pip3 install xpinyin | |
pip3 install Flask-SQLAlchemy | |
python3 poem_rhythm.py | |
""" | |
import warnings | |
warnings.filterwarnings("ignore") | |
import sys | |
import re | |
import json | |
import traceback | |
from flask import Flask | |
import os

app = Flask(__name__)
# The connection string can be supplied through the SQLALCHEMY_DATABASE_URI
# environment variable so that real credentials never live in the source.
# The fallback below is only a placeholder: replace user, password and host
# with values for the MySQL server that holds the imported `poem` database.
app.config['SQLALCHEMY_DATABASE_URI'] = os.environ.get(
    'SQLALCHEMY_DATABASE_URI',
    'mysql+pymysql://user:password@127.0.0.1:3306/poem?charset=utf8mb4',
)
from flask_sqlalchemy import SQLAlchemy, BaseQuery | |
class DataBase(SQLAlchemy):
    """SQLAlchemy extension object that keeps a reference to its Flask app."""

    def init_app(self, app):
        """Remember ``app`` on the instance, then run the stock initialisation.

        The original author notes that this is needed; without it some
        SQLAlchemy features do not work properly.
        """
        self.app = app
        super().init_app(app)


# Single extension instance shared by every model in this script.
db = DataBase()
class Model(db.Model):
    """Abstract base model carrying the table options shared by all models."""

    __abstract__ = True
    # Storage options applied to every table derived from this base.
    __table_args__ = dict(mysql_engine='InnoDB', mysql_charset='utf8mb4')
    __dump_prop__ = []
    query_class = BaseQuery


# Use the customised base for every model declared below, then bind the
# extension to the Flask application.
db.Model = Model
db.init_app(app)
# Tang poetry data: https://github.com/hxgdzyuyi/tang_poetry
class Poetry(db.Model):
    """ORM mapping for one row of the ``poetries`` table."""

    __tablename__ = 'poetries'

    # Surrogate primary key.
    id = db.Column(db.Integer, autoincrement=True, primary_key=True)
    # Integer reference to the poem's author.
    poet_id = db.Column(db.Integer)
    # Full poem text; the analysis below matches and splits this field.
    content = db.Column(db.Text)
    title = db.Column(db.String(length=255))
from functools import partial | |
# https://github.com/lxneng/xpinyin | |
from xpinyin import Pinyin | |
# Romanised initial -> IPA initial. The keys double as the list of initials
# the syllable splitter recognises; the empty key stands for "no initial".
轉換聲母 = {
    # nasals
    'm': 'm',
    'n': 'n',
    'ng': 'ŋ',
    'nj': 'ɲ',
    # unaspirated stops
    'b': 'p',
    'd': 't',
    'g': 'k',
    # aspirated stops
    'p': 'pʰ',
    't': 'tʰ',
    'k': 'kʰ',
    # sibilant fricatives
    's': 's',
    'sh': 'ʃ',
    'x': 'ɕ',
    # unaspirated affricates
    'z': 'ts',
    'zh': 'tʃ',
    # aspirated affricates
    'c': 'tsʰ',
    'ch': 'tʃʰ',
    # glides
    'y': 'i',
    'j': 'j',
    # remaining fricatives
    'f': 'f',
    'h': 'h',
    'v': 'v',
    # lateral and the empty initial
    'l': 'l',
    '': '',
}
def _拼音转讀音(yin, 轉換聲母):
    """Split one romanised syllable into its initial, final and tone.

    Args:
        yin: Romanised syllable, optionally ending in a tone digit. When the
            last character is not a digit, tone ``'1'`` is assumed.
        轉換聲母: Mapping whose keys are the recognised initials.

    Returns:
        A dict with the keys ``'聲母'`` (initial), ``'韻母'`` (final) and
        ``'聲調'`` (tone digit as a string).

    Raises:
        IndexError: If ``yin`` is empty.
    """
    syllable = yin if yin[-1].isdigit() else yin + '1'
    pair = syllable[:2]
    head = syllable[0]

    # --- initial -----------------------------------------------------------
    if pair in 轉換聲母:
        # Two-letter initial.
        if pair == 'ng' and len(syllable) == 3:
            # A bare "ng" is a syllabic nasal: it is the final, not an initial.
            initial = ''
        elif syllable[:3] == 'ngi':
            # "ng" followed by "i" is recorded as the initial "nj".
            initial = 'nj'
        else:
            initial = pair
    elif head in 轉換聲母:
        # One-letter initial; a bare "m" or "n" is a syllabic nasal.
        if len(syllable) == 2 and head in ('m', 'n'):
            initial = ''
        else:
            initial = head
    else:
        initial = ''

    # --- final -------------------------------------------------------------
    if not initial:
        final = syllable[:-1]
    elif initial == 'nj' and len(syllable) > 4 and syllable[:3] == 'ngi':
        if syllable[:4] in ('ngin', 'ngit', 'ngik', 'ngip', 'ngim'):
            # Here the "i" is kept as part of the final.
            final = syllable[2:-1]
        else:
            # Otherwise the "i" is dropped along with the initial.
            final = syllable[3:-1]
    else:
        final = syllable[len(initial):-1]

    return {'聲母': initial, '韻母': final, '聲調': syllable[-1]}
# Syllable splitter pre-bound to the initials table.
拼音转讀音 = partial(_拼音转讀音, 轉換聲母=轉換聲母)
py = Pinyin()

# Mandarin data: https://zhuanlan.zhihu.com/p/260660949
元音 = ['a', 'o', 'e', 'ê', 'i', 'u', 'ü']

# Hakka (Bobai) data: https://github.com/shuxiang/hakka-lang-bobai
# Read the file explicitly as UTF-8 (the platform default encoding may not be
# UTF-8) and close the handle as soon as the JSON has been parsed.
with open('../hakka-lang-bobai/hakka-lang-bobai.json', 'r', encoding='utf-8') as _hakka_file:
    hakka = json.load(_hakka_file)
# Character -> romanised reading(s), keyed by simplified form first ...
hakka_dict = {zi['zh-CN']: zi['ptk'] for zi in hakka if zi['zh-CN']}
# ... then by traditional form.
hakka_dict.update({zi['zh-hant']: zi['ptk'] for zi in hakka if zi['zh-hant']})

# Number of poems analysed.
rhythm_nums = 0
# Poems whose rhyme characters rhyme in Mandarin / whose tone pattern fits.
pth_nums = 0
pth_tones_nums = 0
# Poems whose rhyme characters rhyme in Hakka / whose tone pattern fits.
hakka_nums = 0
hakka_tones_nums = 0
# Poems that pass in both languages.
both_nums = 0
both_tones_nums = 0
# Decide whether the tone patterns of the two lines of a couplet oppose
# each other.
def is_sentence_match(tones, tones2, strict=True):
    """Return True when two lines have opposing level/oblique patterns.

    A tone number of 1 or 2 counts as level, anything greater as oblique.
    ``tones`` and ``tones2`` hold the tones at the checked positions of the
    first and second line: two entries for a five-character line, three for
    a seven-character line.

    Number of leading entries compared:

    * strict, up to two entries:      2
    * strict, more than two entries:  3
    * lenient, up to two entries:     1
    * lenient, more than two entries: 2

    The lines match when the compared entries of ``tones`` alternate between
    level and oblique and every compared entry of ``tones2`` is the opposite
    class of the entry at the same position in ``tones``.

    Args:
        tones: Tone numbers of the first line.
        tones2: Tone numbers of the second line.
        strict: Compare every entry (True) or one entry fewer (False).

    Returns:
        bool: True if the lines oppose each other, False otherwise.

    Raises:
        IndexError: If either sequence is shorter than the compared span.
    """
    if len(tones) <= 2:
        span = 2 if strict else 1
    else:
        span = 3 if strict else 2

    # True = level tone, False = oblique tone.
    first = [tones[pos] <= 2 for pos in range(span)]
    second = [tones2[pos] <= 2 for pos in range(span)]

    alternates = all(first[pos] != first[pos + 1] for pos in range(span - 1))
    # The lenient short case used to test tones[1] instead of tones2[0] in its
    # second branch and so ignored the second line; both lines are always
    # compared here.
    opposed = all(a != b for a, b in zip(first, second))
    return alternates and opposed
# Regulated-verse layouts that are analysed: two or four couplets of five- or
# seven-character lines, each couplet written as "<line>,<line>。".
_REGULATED_VERSE_PATTERNS = tuple(
    re.compile(_pattern)
    for _pattern in (
        r'^([\u4e00-\u9fa5]{5}\,[\u4e00-\u9fa5]{5}\。){2}$',
        r'^([\u4e00-\u9fa5]{5}\,[\u4e00-\u9fa5]{5}\。){4}$',
        r'^([\u4e00-\u9fa5]{7}\,[\u4e00-\u9fa5]{7}\。){2}$',
        r'^([\u4e00-\u9fa5]{7}\,[\u4e00-\u9fa5]{7}\。){4}$',
    )
)
# Number of poems fetched per query.
_PAGE_SIZE = 100


def _even_position_tones(line_tones):
    """Return the tones of the 2nd, 4th and, when present, 6th syllable.

    The selection depends on the line's own length, so a five-character line
    yields two tones and a seven-character line yields three.
    """
    return line_tones[1:6:2]


with app.test_request_context():
    i = 0
    while True:
        # Order by primary key so that OFFSET/LIMIT pages are stable and no
        # row is skipped or visited twice.
        p_list = (
            Poetry.query.order_by(Poetry.id)
            .offset(i * _PAGE_SIZE)
            .limit(_PAGE_SIZE)
            .all()
        )
        if not p_list:
            break
        i += 1

        for poem in p_list:
            # Only regulated verse is analysed; a missing text never matches.
            content = poem.content or ''
            if not any(p.match(content) for p in _REGULATED_VERSE_PATTERNS):
                continue
            try:
                rhythm_nums += 1
                # Splitting on the full stop leaves one empty trailing item,
                # so a quatrain gives 3 parts and an eight-line poem gives 5.
                sentence = content.split('。')
                length = len(sentence)
                if length not in (3, 5):
                    continue
                # Rhyme characters: the last character of every couplet.
                zi_list = [couplet[-1] for couplet in sentence[:-1]]

                # ---- Mandarin finals of the rhyme characters ----
                yunmu_list = []
                for zi in zi_list:
                    try:
                        # Initial
                        shengmu = py.get_initial(zi, with_retroflex=True).lower()
                        # Final: strip the initial from the front only. An
                        # unlimited replace() would also delete later
                        # occurrences of the same letter ("nan" -> "a").
                        yunmu = py.get_pinyin(zi, tone_marks=None).replace(shengmu, '', 1)
                        # Normalise the finals iu, ui and v, then drop the medial.
                        if yunmu == 'iu':
                            yunmu = 'ou'
                        elif yunmu == 'ui':
                            yunmu = 'ei'
                        elif yunmu == 'v':
                            yunmu = 'ü'
                        if len(yunmu) > 1 and yunmu[0] in ('i', 'u', 'ü') and yunmu[1] in 元音:
                            yunmu = yunmu[1:]
                        yunmu_list.append(yunmu)
                    except Exception:
                        # Best effort: a character that cannot be converted is
                        # left out, which makes the length check below fail.
                        pass
                print('\npth', yunmu_list, zi_list)
                # Rhymes when every rhyme character resolved to the same final.
                is_match_pth = len(set(yunmu_list)) == 1 and len(yunmu_list) == len(zi_list)
                if is_match_pth:
                    pth_nums += 1
                    print('match pth==>', yunmu_list[0])

                # ---- Hakka finals of the rhyme characters ----
                yunmu_list2 = []
                for zi in zi_list:
                    try:
                        # Several readings are separated by "/"; use the last one.
                        reading = hakka_dict.get(zi, '').split('/')[-1]
                        if reading:
                            yunmu = 拼音转讀音(reading)['韻母']
                            # Drop the medial.
                            if len(yunmu) > 1 and yunmu[0] in ('i', 'u') and yunmu[1] in 元音:
                                yunmu = yunmu[1:]
                            yunmu_list2.append(yunmu)
                    except Exception:
                        pass
                print('hakka', yunmu_list2, zi_list)
                is_match_hakka = len(set(yunmu_list2)) == 1 and len(yunmu_list2) == len(zi_list)
                if is_match_hakka:
                    hakka_nums += 1
                    print('match hakka==>', yunmu_list2[0])

                # Rhymes in both languages.
                # NOTE: both_nums is tallied but not part of the report below.
                if is_match_hakka and is_match_pth:
                    both_nums += 1

                # ---- Tone patterns ----
                # Collect the tones of every line in both languages.
                pth_tones_list = []
                hakka_tones_list = []
                try:
                    sentence_list = []
                    for s in sentence[:-1]:
                        sentence_list += s.split(',')
                    for short in sentence_list:
                        # e.g. 'shang4-hai3' -> [4, 3]
                        pth_tones = [
                            int(zi[-1])
                            for zi in py.get_pinyin(short, tone_marks='numbers').split('-')
                            if zi[-1].isdigit()
                        ]
                        # A character without a Hakka reading raises here and
                        # ends tone collection for this poem.
                        hakka_tones = [
                            int(拼音转讀音(hakka_dict.get(zi, '').split('/')[-1])['聲調'])
                            for zi in short
                        ]
                        # Keep only lines that are complete and comparable.
                        if len(pth_tones) == len(hakka_tones) and len(pth_tones) in (5, 7):
                            pth_tones_list.append(pth_tones)
                            hakka_tones_list.append(hakka_tones)
                except Exception:
                    pass

                # Judge every couplet, but only when all lines were collected.
                is_match_pth_tone = []
                is_match_hakka_tone = []
                if (length == 3 and len(pth_tones_list) == 4) or (length == 5 and len(pth_tones_list) == 8):
                    for m in range(0, len(pth_tones_list), 2):
                        # Positions 1, 3 and 5 are free; 2, 4 and 6 must be
                        # distinct. The positions are chosen from each line's
                        # own length. `length` counts sentences, not
                        # characters, so using it here indexed past the end of
                        # five-character quatrains and checked only two
                        # positions of seven-character eight-line poems.
                        pth = _even_position_tones(pth_tones_list[m])
                        pth2 = _even_position_tones(pth_tones_list[m + 1])
                        hakka_key = _even_position_tones(hakka_tones_list[m])
                        hakka_key2 = _even_position_tones(hakka_tones_list[m + 1])
                        is_match_pth_tone.append(is_sentence_match(pth, pth2))
                        is_match_hakka_tone.append(is_sentence_match(hakka_key, hakka_key2))

                # A poem fits when it has couplets and every couplet matched.
                pth_fits = bool(is_match_pth_tone) and all(is_match_pth_tone)
                hakka_fits = bool(is_match_hakka_tone) and all(is_match_hakka_tone)
                if pth_fits:
                    pth_tones_nums += 1
                if hakka_fits:
                    hakka_tones_nums += 1
                if pth_fits and hakka_fits:
                    both_tones_nums += 1
            except Exception:
                # Best effort: a poem that cannot be analysed is skipped.
                # Ctrl-C still interrupts the run.
                continue

print('诗总数: ', rhythm_nums)
print('普通话押韵数:', pth_nums)
print('普通话合辙数:', pth_tones_nums)
print('客家话押韵数:', hakka_nums)
print('客家话合辙数:', hakka_tones_nums)
print('都平仄合辙数:', both_tones_nums)
# # strict | |
# 诗总数: 28850 | |
# 普通话押韵数: 12901 | |
# 普通话合辙数: 5196 | |
# 客家话押韵数: 15314 | |
# 客家话合辙数: 8278 | |
# 都平仄合辙数: 2591 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment