# encoding: utf-8
"""
@author: BrikerMan
@contact: [email protected]
@blog: https://eliyar.biz
@version: 1.0
@license: Apache Licence
@file: pre_process
@time: 2018/11/18
"""
import re
import os
import logging
import pathlib
import shutil
from os import walk
from os.path import splitext
from os.path import join
from typing import List

import pandas as pd
from tabulate import tabulate

# Matches a single character in the basic CJK Unicode range
chinese_regex = re.compile('[\u4e00-\u9fa5]')

# Punctuation marks that will be used as sequence labels
TARGET_CHARS = [',', '。', '】', '【', '、', ':', '“', '”', ';', '》', '《', '○', ')', '(', '?']


def get_all_txt_files(path: str) -> List[str]:
    """Recursively collect the paths of all .txt files under `path`."""
    text_files = list()
    for root, dirs, files in walk(path):
        for f in files:
            if splitext(f)[1].lower() == ".txt":
                text_files.append(join(root, f))
    return text_files


def get_all_non_chinese_tokens(path):
    """Count every non-Chinese character that appears in the corpus under `path`."""
    tokens2count = {}
    for file in get_all_txt_files(path):
        # Strip all Chinese characters, leaving only punctuation and other symbols
        with open(file, 'r', encoding='utf-8') as f:
            tokens = chinese_regex.sub('', f.read())
        for i in tokens:
            if i and i != ' ':
                tokens2count[i] = tokens2count.get(i, 0) + 1
    return tokens2count
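
# Illustrative shape of the result (made-up counts, not taken from the corpus):
#     {'。': 12034, '、': 9871, '\n': 4521, ...}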


def get_mark2_count():
    """Print the non-Chinese tokens that occur at least 7000 times in the corpus,
    presumably how the TARGET_CHARS list above was chosen."""
    path = '/Users/brikerman/Downloads/殆知阁古代文献藏书/易藏'
    t2c = get_all_non_chinese_tokens(path)
    data_list = []
    for k, v in t2c.items():
        data_list.append({
            'token': k,
            'count': v
        })
    df = pd.DataFrame(data_list)
    df = df[['token', 'count']]
    df = df.sort_values('count', ascending=False)
    print(df)
    # Keep only the high-frequency tokens
    df = df[df['count'] >= 7000]
    t = []
    for v in df.values:
        t.append(v[0])
    print(t)


def get_file_info(path):
    """Compute per-file statistics: Chinese character count, punctuation count,
    and the resulting punctuation rate."""
    fileinfo = []
    for file in get_all_txt_files(path):
        info = {
            'token_count': 0,
            'chinese_count': 0,
            'mark_count': 0,
            'mark_list': []
        }
        with open(file, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()
        for line in lines:
            line = line.strip()
            for char in line:
                if chinese_regex.match(char):
                    info['chinese_count'] += 1
                elif char in TARGET_CHARS:
                    info['mark_count'] += 1
                    info['mark_list'].append(char)
        info['token_count'] = info['chinese_count'] + info['mark_count']
        info['mark_list'] = ' '.join(set(info['mark_list']))
        # Skip empty files so the division below cannot fail
        if info['token_count'] == 0:
            continue
        info['mark_rate'] = info['mark_count'] / info['token_count']
        info['file'] = file.replace(path, '')
        fileinfo.append(info)
    df = pd.DataFrame(fileinfo)
    df.to_csv('file_info.csv')
    print(tabulate(df, headers='keys', tablefmt='psql'))
    return df
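
# The resulting DataFrame / file_info.csv holds one row per text file with the
# columns chinese_count, mark_count, mark_list, mark_rate, token_count and
# file (the path relative to the corpus root).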


def copy_files(data_path, target_path):
    """Split the corpus into `marked` (well punctuated) and `unmarked` files,
    using a punctuation rate of 0.1 as the threshold."""
    marked_path = os.path.join(target_path, 'marked')
    unmarked_path = os.path.join(target_path, 'unmarked')
    pathlib.Path(marked_path).mkdir(parents=True, exist_ok=True)
    pathlib.Path(unmarked_path).mkdir(parents=True, exist_ok=True)
    df = get_file_info(data_path)
    marked_df = df[df['mark_rate'] >= 0.1]
    unmarked_df = df[df['mark_rate'] < 0.1]
    columns = list(df.columns)
    for file in marked_df.values:
        file_name = file[columns.index('file')]
        origin = data_path + file_name
        target = marked_path + file_name
        # The relative file name may contain sub-directories, so create them first
        pathlib.Path(target).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(origin, target)
    for file in unmarked_df.values:
        file_name = file[columns.index('file')]
        origin = data_path + file_name
        target = unmarked_path + file_name
        pathlib.Path(target).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(origin, target)
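
# After running copy_files the target directory contains `marked/` and
# `unmarked/` sub-folders holding the well-punctuated and poorly-punctuated
# files respectively, mirroring the original sub-directory layout.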


def format_line(text):
    """
    Format a single line into a (characters, labels) pair.
    Chinese characters become the input sequence; each label is 'O' unless the
    character is immediately followed by a punctuation mark, in which case the
    label is that mark.
    :param text:
    :return:
    """
    target_x = []
    target_label = []
    for char in text:
        if chinese_regex.match(char):
            target_x.append(char)
            target_label.append('O')
        elif char in TARGET_CHARS and len(target_label) > 0:
            # Attach the punctuation mark to the preceding character's label
            target_label[-1] = char
    return target_x, target_label
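
# Illustration of the labelling scheme (hypothetical line, not from the corpus):
#     format_line('天行健。君子以自强不息。')
#     -> (['天', '行', '健', '君', '子', '以', '自', '强', '不', '息'],
#         ['O', 'O', '。', 'O', 'O', 'O', 'O', 'O', 'O', '。'])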


def format_all_file(path):
    """
    Format every .txt file under `path` into training samples and save them
    to formatted.csv in the same directory.
    :param path:
    :return:
    """
    data = []
    for file in get_all_txt_files(path):
        with open(file, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()
        for line in lines:
            line = line.strip()
            if line:
                x, y = format_line(line)
                if len(x) == len(y):
                    data.append({
                        'raw': line,
                        'x': x,
                        'y': y,
                        'length': len(x),
                    })
                else:
                    logging.error("Failed to format line: {}".format(line))
    df = pd.DataFrame(data)
    df.to_csv(path + '/formatted.csv')
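
# formatted.csv ends up with one row per usable line and the columns
# raw (the original line), x (the character list), y (the label list) and
# length (the number of characters).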


if __name__ == '__main__':
    # Path to the corpus
    path = '/Users/brikerman/Downloads/殆知阁古代文献藏书/易藏'
    # Classify the files and pull out the well-annotated (punctuated) data
    copy_files(path, path)
    # Format the annotated data into training samples
    format_all_file(path + '/marked')
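
# A minimal sketch of how the output could be loaded back for training
# (assumption: pandas serialises the x/y lists as their Python repr strings):
#
#     import ast
#     import pandas as pd
#     df = pd.read_csv(path + '/marked/formatted.csv')
#     samples_x = [ast.literal_eval(s) for s in df['x']]
#     samples_y = [ast.literal_eval(s) for s in df['y']]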