# encoding: utf-8
"""
@author: BrikerMan
@contact: [email protected]
@blog: https://eliyar.biz
@version: 1.0
@license: Apache Licence
@file: pre_process
@time: 2018/11/18
"""
import re
import os
import logging
import pathlib
import shutil
from os import walk
from os.path import splitext
from os.path import join
from typing import List

import pandas as pd
from tabulate import tabulate

# Matches a single character in the basic CJK Unicode range
chinese_regex = re.compile('[\u4e00-\u9fa5]')

# Punctuation marks that will be used as sequence labels
TARGET_CHARS = [',', '。', '】', '【', '、', ':', '“', '”', ';', '》', '《', '○', ')', '(', '?']


def get_all_txt_files(path: str) -> List[str]:
    """Recursively collect the paths of all .txt files under `path`."""
    text_files = list()
    for root, dirs, files in walk(path):
        for f in files:
            if splitext(f)[1].lower() == ".txt":
                text_files.append(join(root, f))
    return text_files


def get_all_non_chinese_tokens(path):
    """Count every non-Chinese character that appears in the corpus under `path`."""
    tokens2count = {}
    for file in get_all_txt_files(path):
        # Strip all Chinese characters, leaving only punctuation and other symbols
        with open(file, 'r', encoding='utf-8') as f:
            tokens = chinese_regex.sub('', f.read())
        for i in tokens:
            if i and i != ' ':
                tokens2count[i] = tokens2count.get(i, 0) + 1
    return tokens2count
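
# Illustrative shape of the result (made-up counts, not taken from the corpus):
#     {'。': 12034, '、': 9871, '\n': 4521, ...}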


def get_mark2_count():
    """Print the non-Chinese tokens that occur at least 7000 times in the corpus,
    presumably how the TARGET_CHARS list above was chosen."""
    path = '/Users/brikerman/Downloads/殆知阁古代文献藏书/易藏'
    t2c = get_all_non_chinese_tokens(path)
    data_list = []
    for k, v in t2c.items():
        data_list.append({
            'token': k,
            'count': v
        })
    df = pd.DataFrame(data_list)
    df = df[['token', 'count']]
    df = df.sort_values('count', ascending=False)
    print(df)
    # Keep only the high-frequency tokens
    df = df[df['count'] >= 7000]
    t = []
    for v in df.values:
        t.append(v[0])
    print(t)


def get_file_info(path):
    """Compute per-file statistics: Chinese character count, punctuation count,
    and the resulting punctuation rate."""
    fileinfo = []
    for file in get_all_txt_files(path):
        info = {
            'token_count': 0,
            'chinese_count': 0,
            'mark_count': 0,
            'mark_list': []
        }
        with open(file, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()
        for line in lines:
            line = line.strip()
            for char in line:
                if chinese_regex.match(char):
                    info['chinese_count'] += 1
                elif char in TARGET_CHARS:
                    info['mark_count'] += 1
                    info['mark_list'].append(char)
        info['token_count'] = info['chinese_count'] + info['mark_count']
        info['mark_list'] = ' '.join(set(info['mark_list']))
        # Skip empty files so the division below cannot fail
        if info['token_count'] == 0:
            continue
        info['mark_rate'] = info['mark_count'] / info['token_count']
        info['file'] = file.replace(path, '')
        fileinfo.append(info)
    df = pd.DataFrame(fileinfo)
    df.to_csv('file_info.csv')
    print(tabulate(df, headers='keys', tablefmt='psql'))
    return df
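
# The resulting DataFrame / file_info.csv holds one row per text file with the
# columns chinese_count, mark_count, mark_list, mark_rate, token_count and
# file (the path relative to the corpus root).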


def copy_files(data_path, target_path):
    """Split the corpus into `marked` (well punctuated) and `unmarked` files,
    using a punctuation rate of 0.1 as the threshold."""
    marked_path = os.path.join(target_path, 'marked')
    unmarked_path = os.path.join(target_path, 'unmarked')
    pathlib.Path(marked_path).mkdir(parents=True, exist_ok=True)
    pathlib.Path(unmarked_path).mkdir(parents=True, exist_ok=True)
    df = get_file_info(data_path)
    marked_df = df[df['mark_rate'] >= 0.1]
    unmarked_df = df[df['mark_rate'] < 0.1]
    columns = list(df.columns)
    for file in marked_df.values:
        file_name = file[columns.index('file')]
        origin = data_path + file_name
        target = marked_path + file_name
        # The relative file name may contain sub-directories, so create them first
        pathlib.Path(target).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(origin, target)
    for file in unmarked_df.values:
        file_name = file[columns.index('file')]
        origin = data_path + file_name
        target = unmarked_path + file_name
        pathlib.Path(target).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(origin, target)
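
# After running copy_files the target directory contains `marked/` and
# `unmarked/` sub-folders holding the well-punctuated and poorly-punctuated
# files respectively, mirroring the original sub-directory layout.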


def format_line(text):
    """
    Format a single line into a (characters, labels) pair.
    Chinese characters become the input sequence; each label is 'O' unless the
    character is immediately followed by a punctuation mark, in which case the
    label is that mark.
    :param text:
    :return:
    """
    target_x = []
    target_label = []
    for char in text:
        if chinese_regex.match(char):
            target_x.append(char)
            target_label.append('O')
        elif char in TARGET_CHARS and len(target_label) > 0:
            # Attach the punctuation mark to the preceding character's label
            target_label[-1] = char
    return target_x, target_label
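
# Illustration of the labelling scheme (hypothetical line, not from the corpus):
#     format_line('天行健。君子以自强不息。')
#     -> (['天', '行', '健', '君', '子', '以', '自', '强', '不', '息'],
#         ['O', 'O', '。', 'O', 'O', 'O', 'O', 'O', 'O', '。'])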


def format_all_file(path):
    """
    Format every .txt file under `path` into training samples and save them
    to formatted.csv in the same directory.
    :param path:
    :return:
    """
    data = []
    for file in get_all_txt_files(path):
        with open(file, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()
        for line in lines:
            line = line.strip()
            if line:
                x, y = format_line(line)
                if len(x) == len(y):
                    data.append({
                        'raw': line,
                        'x': x,
                        'y': y,
                        'length': len(x),
                    })
                else:
                    logging.error("Failed to format line: {}".format(line))
    df = pd.DataFrame(data)
    df.to_csv(path + '/formatted.csv')
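
# formatted.csv ends up with one row per usable line and the columns
# raw (the original line), x (the character list), y (the label list) and
# length (the number of characters).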


if __name__ == '__main__':
    # Path to the corpus
    path = '/Users/brikerman/Downloads/殆知阁古代文献藏书/易藏'
    # Classify the files and pull out the well-annotated (punctuated) data
    copy_files(path, path)
    # Format the annotated data into training samples
    format_all_file(path + '/marked')
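
# A minimal sketch of how the output could be loaded back for training
# (assumption: pandas serialises the x/y lists as their Python repr strings):
#
#     import ast
#     import pandas as pd
#     df = pd.read_csv(path + '/marked/formatted.csv')
#     samples_x = [ast.literal_eval(s) for s in df['x']]
#     samples_y = [ast.literal_eval(s) for s in df['y']]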