Last active
December 16, 2019 19:51
-
-
Save bowbowbow/a037ff61c61c13947a56f6a2df52047d to your computer and use it in GitHub Desktop.
세종 코퍼스에서 형태소 분석 데이터 추출
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
from bs4 import BeautifulSoup | |
from tqdm import tqdm | |
import json | |
def group_morpheme_lines(raw_text):
    """Group tab-separated annotation lines into sentences.

    Each line of ``raw_text`` is split on tab characters and its first
    column is discarded. A line that still has at least two columns is
    kept as one entry of the current sentence. Any other line (blank
    line, markup residue, ...) ends the current sentence.

    Args:
        raw_text: Text content of one corpus element, lines separated
            by ``'\\n'``.

    Returns:
        A list of sentences. Each sentence is a non-empty list of
        entries, and each entry is the list of columns that follow the
        first column of its line.
    """
    sentences = []
    current = []
    for line in raw_text.split('\n'):
        fields = line.split('\t')[1:]
        if len(fields) >= 2:
            current.append(fields)
            continue
        # Separator line: close the sentence collected so far, if any.
        if current:
            sentences.append(current)
            current = []
    # A sentence that runs up to the end of the text has no separator
    # line after it; without this flush it would be silently dropped.
    if current:
        sentences.append(current)
    return sentences


def main():
    """Extract morpheme-analysis data from the corpus files into JSON.

    Reads every ``./corpus-utf8/*.txt`` file, keeps those whose title
    contains '형태소' (Korean for "morpheme"), collects the sentences
    found in their text elements, writes them to ``./output.json`` and
    prints the number of collected entries and sentences.
    """
    paths = glob.glob('./corpus-utf8/*.txt')
    print('len(paths):', len(paths))

    pair_count = 0
    data = []
    for path in tqdm(paths):
        # The input directory name indicates UTF-8 files; be explicit
        # rather than relying on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')

        title = soup.select('title')
        # Skip files without a title instead of failing on title[0].
        if not title or '형태소' not in title[0].get_text():
            continue

        texts = soup.select('text body p, text p')
        if not texts:
            # For dialogue files such as 8CT_0042.txt the content sits
            # directly inside the text tag.
            texts = soup.select('text')

        for text in texts:
            sentences = group_morpheme_lines(text.get_text())
            pair_count += sum(len(sentence) for sentence in sentences)
            data.extend(sentences)

    # ensure_ascii=False writes non-ASCII characters verbatim, so the
    # output file needs an explicit encoding that can represent them.
    with open("./output.json", "w", encoding='utf-8') as f:
        json.dump(data, f, indent=2, sort_keys=True, ensure_ascii=False)

    print('pair_count :', pair_count)
    print('sent_count :', len(data))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://github.com/coolengineer/sejong-corpus 를 통해 세종 코퍼스 문서를 다운로드한 뒤 추출을 진행함.