JVSコーパス内のtranscripts_utf8.txtとファイル名の不一致を修正するスクリプト。
- 対象: jvs_ver1.zip
- md5 hash: 2987778b0ee830914bfebb97783d0c3e
- 方法: JVSコーパスのディレクトリ以下で fix.sh を実行
JVSコーパス内のtranscripts_utf8.txtとファイル名の不一致を修正するスクリプト。
jvs_ver1.zip
2987778b0ee830914bfebb97783d0c3e
fix.shを実行
| """ | |
| $ python modify_sudachi_json.py --user_dict path/to/user.dic --debug | |
| """ | |
| import argparse | |
| import errno | |
| import json | |
| import os | |
| import site | |
| from pathlib import Path |
| import json | |
| from http.server import HTTPServer, SimpleHTTPRequestHandler | |
| class CustomHandler(SimpleHTTPRequestHandler): | |
| def do_GET(self): | |
| SimpleHTTPRequestHandler.do_GET(self) | |
| print("===headers===\n", self.headers) | |
| print("===path===\n", self.path) | |
| def do_POST(self): |
| # Ref. http://www.cs.cmu.edu/~wcohen/10-605/notes/autodiff.pdf | |
| from collections import namedtuple | |
| Assignment = namedtuple("Item", ("z", "g", "y_list")) | |
| # f(x_1, x_2) = (2x_1 + x_2)^2 | |
| l = [Assignment(z="z1", g="add", y_list=["x1", "x1"]), | |
| Assignment(z="z2", g="add", y_list=["z1", "x2"]), | |
| Assignment(z="f", g="square", y_list=["z2"])] |
| # for Python 3.6 | |
| import unicodedata | |
| # from 0 through 1,114,111 (https://docs.python.org/3.6/library/functions.html#chr) | |
| for unicode_id in range(1114111): | |
| char = chr(unicode_id) | |
| normalized_char = unicodedata.normalize("NFKC", char) | |
| if char != normalized_char: | |
| if len(normalized_char) == 1: | |
| code_point = ord(normalized_char) |
| ImageHash 3.1 | |
| arrow 0.9.0 | |
| altair 1.2.0 | |
| tqdm 4.10.0 | |
| mlxtend 0.5.1.dev0 | |
| PyWavelets 0.5.0 | |
| pyLDAvis 2.0.0 | |
| TPOT 0.6.7 | |
| traitlets 4.3.1 | |
| funcy 1.7.2 |