Skip to content

Instantly share code, notes, and snippets.

@ayaka14732
Last active July 24, 2024 16:21
Show Gist options
  • Save ayaka14732/d4527a028e5e96e54de67305eb2967f1 to your computer and use it in GitHub Desktop.
Save ayaka14732/d4527a028e5e96e54de67305eb2967f1 to your computer and use it in GitHub Desktop.
入声字情感分析:入声字的情感与舒声字无明显差别
/zh_en.linux
/char.csv
/词表.txt
/结果.txt

与舒声字相比,入声字表示的情感更负面吗?

wget https://github.com/Heptagon196/Dict/raw/master/dic/zh_en.linux
wget https://github.com/CanCLID/rime-cantonese-upstream/raw/d82d3e3e5fc3d39cc3ec67116385e6be5ec37b17/char.csv
python preprocess.py
python 入声字情感分析.py

结果:

舒声字平均分: 60.90
入声字平均分: 59.42

结果表明入声字的情感与舒声字无明显差别。

from collections import defaultdict
import re
# Maps each single-character headword to the list of explanation items kept
# for it (only items that contain no Han characters are appended below).
d = defaultdict(list)
# Reference for the Han character class used below:
# https://ayaka.shn.hk/hanregex/zh-CN/
def has_han(s):
    """Return True if *s* contains at least one Han character.

    The character class covers U+3006, U+3007, the CJK Unified Ideographs
    block (U+4E00-U+9FFF), Extension A (U+3400-U+4DBF) and the supplementary
    ranges U+20000-U+2A6DF, U+2A700-U+2EBEF and U+30000-U+3134F.
    """
    han_pattern = r'[\u3006\u3007\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002ebef\U00030000-\U0003134f]'
    # A match object is always truthy, so "is not None" equals bool(match).
    return re.search(han_pattern, s) is not None
# Workaround for cases like 'AAAA (BBBB; CCCC); DDDD': the explanation is
# later split on '; ', which must not cut through a parenthesised remark.
def convert_all_nested_semicolon_to_正(s):
    """Replace every ';' enclosed by a matched pair of parentheses with '正'.

    Semicolons are protected at any nesting depth, so in
    'a (b (c); d); e' the semicolon before 'd' is replaced while the one
    before 'e' is kept.  Unmatched '(' or ')' characters protect nothing:
    a semicolon is only replaced when both the opening and the closing
    parenthesis around it exist.

    Args:
        s: The explanation text to protect.

    Returns:
        A string of the same length as *s* with the enclosed semicolons
        replaced by the placeholder '正'.
    """
    chars = list(s)
    open_positions = []  # indexes of '(' still waiting for their ')'
    for index, ch in enumerate(s):
        if ch == '(':
            open_positions.append(index)
        elif ch == ')' and open_positions:
            # A matched pair was just closed: protect everything inside it.
            start = open_positions.pop()
            for inner in range(start + 1, index):
                if chars[inner] == ';':
                    chars[inner] = '正'
    return ''.join(chars)
def convert_正_to_semicolon(s):
    """Turn every '正' placeholder in *s* back into a ';'."""
    pieces = s.split('正')
    return ';'.join(pieces)
# The dictionary file is read as alternating lines: a headword, then its
# explanation.
with open('zh_en.linux', encoding='utf-8') as f:
    # zip(f, f) pulls two consecutive lines per iteration and stops at end of
    # file; a trailing headword without an explanation line is dropped.
    for raw_word, raw_explanation in zip(f, f):
        word = raw_word.rstrip('\n')
        explanation = raw_explanation.rstrip('\n')
        if len(word) != 1:
            continue  # only single-character headwords are collected
        # Every kept explanation must begin with a literal backslash + 'n'.
        assert explanation.startswith(r'\n')
        # Hide semicolons inside parentheses so the split below ignores them.
        protected = convert_all_nested_semicolon_to_正(explanation[2:])
        for piece in protected.split('; '):
            item = convert_正_to_semicolon(piece)
            if not has_han(item):
                d[word].append(item)
# Write one tab-separated "headword<TAB>gloss" line per collected headword.
with open('词表.txt', 'w', encoding='utf-8') as f:
    for headword, glosses in d.items():
        # Prefer the first gloss that does not open with '('; if every gloss
        # is parenthesised, fall back to the very first one.
        plain_glosses = [gloss for gloss in glosses if gloss[0] != '(']
        if plain_glosses:
            chosen = plain_glosses[0]
        else:
            chosen = glosses[0]
        # A tab inside the gloss would corrupt the two-column format.
        assert '\t' not in chosen
        f.write(f'{headword}\t{chosen}\n')
tqdm
ToJyutping==0.2.1
numpy==1.22.3
tensorflow==2.10.0
transformers==4.18.0
import ToJyutping
from tqdm import tqdm
from transformers import pipeline
import numpy as np
import re
# Sentiment classifier shared by the batch loop below.
# NOTE(review): only the task name is passed, so the model is presumably
# whatever default the library picks for this task -- consider naming a
# model explicitly so scores stay reproducible; confirm against the
# transformers documentation.
classifier = pipeline('sentiment-analysis')
def handle_one_result(result):
    """Map one classifier result onto a 0-100 positivity score.

    *result* is a mapping with a 'label' and a 'score' entry.  A 'POSITIVE'
    label yields ``score * 100``; any other label yields
    ``100 - score * 100``.  The assertions check that the result lands on
    the expected side of 50.
    """
    if result['label'] != 'POSITIVE':
        positivity = 100. - result['score'] * 100
        assert positivity < 50.
        return positivity
    positivity = result['score'] * 100
    assert positivity > 50.
    return positivity
def 分块(lst, n):
    """Split *lst* into consecutive slices of length *n* (the last may be shorter)."""
    chunks = []
    for start in range(0, len(lst), n):
        chunks.append(lst[start:start + n])
    return chunks
def 根据粤拼判断入声(粵拼: str) -> bool:
    """Return True if the Jyutping string ends in p, t or k plus a digit.

    Args:
        粵拼: The romanisation to inspect; only its final two characters
            decide the result.

    Returns:
        True when the text ends with one of 'p', 't', 'k' followed by a
        single digit, False otherwise (including for an empty string).
    """
    # Raw string literal: in a plain literal the backslash-d sequence is an
    # invalid string escape and makes recent Python versions emit a warning.
    return bool(re.search(r'[ptk]\d$', 粵拼))
# Load the "character<TAB>gloss" pairs written by the preprocessing step.
汉字_英文列表 = []
with open('词表.txt', encoding='utf-8') as f:
    for line in f:
        # Each line must hold exactly two tab-separated fields.
        汉字, 英文 = line.rstrip('\n').split('\t')
        汉字_英文列表.append((汉字, 英文))
# The glosses alone, in file order, are what gets fed to the classifier.
英文列表 = [gloss for _, gloss in 汉字_英文列表]
# Classify the glosses in batches of 64 and keep one 0-100 score per gloss,
# in the same order as 英文列表.
分块英文列表 = 分块(英文列表, 64)
情感分析结果列表 = []
for 英文块 in tqdm(分块英文列表):
    情感分析结果列表.extend(
        handle_one_result(raw_result) for raw_result in classifier(英文块)
    )
# Sort each character's score into one of two lists, depending on what
# 根据粤拼判断入声 reports for its Jyutping, and log one tab-separated row
# per character.
舒声字得分列表 = []
入声字得分列表 = []
with open('结果.txt', 'w', encoding='utf-8') as f:
    for (汉字, 英文), 情感分析结果 in zip(汉字_英文列表, 情感分析结果列表):
        粵拼 = ToJyutping.get_jyutping_text(汉字)
        if 粵拼:  # characters without a reading are skipped
            是入声字 = 根据粤拼判断入声(粵拼)
            if 是入声字:
                tone_label, bucket = '入', 入声字得分列表
            else:
                tone_label, bucket = '舒', 舒声字得分列表
            bucket.append(情感分析结果)
            row = (汉字, 粵拼, tone_label, 英文, 情感分析结果)
            f.write('\t'.join(map(str, row)) + '\n')
# Report the mean score of each group, rounded to two decimals.
舒声字平均分 = np.mean(舒声字得分列表)
print(f'舒声字平均分:{舒声字平均分:.2f}')
入声字平均分 = np.mean(入声字得分列表)
print(f'入声字平均分:{入声字平均分:.2f}')
@ChouUn
Copy link

ChouUn commented Jul 24, 2024

非常好研究,让我 GPU 风扇旋转

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment