This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 会社名サンプラー | |
import os | |
import pickle | |
import random | |
from dataclasses import dataclass | |
from typing import List | |
import numpy as np | |
def seed_everything(seed: int = 1234): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
class WordDropout(torch.nn.Module): | |
""" | |
Implementation of word dropout. Randomly drops out entire words (or characters) in embedding space. | |
""" | |
def __init__(self, dropout_rate=0.05, inplace=False): | |
super(WordDropout, self).__init__() | |
self.dropout_rate = dropout_rate |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import os | |
import unicodedata | |
from dataclasses import dataclass | |
from enum import Enum | |
from itertools import product, starmap | |
from pathlib import Path | |
from typing import Dict, List, Optional, Union | |
import MeCab |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import shutil | |
import sys | |
import zipfile | |
def copy_docs(zip_path, p): | |
try: | |
with zipfile.ZipFile(zip_path) as z: | |
targets = [fn for fn in z.namelist() if fn.endswith('doc') or fn.endswith('docx')] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unicodedata | |
from typing import Dict, List, Optional, Union | |
import MeCab | |
from tokenizers import ( | |
AddedToken, | |
BertWordPieceTokenizer, | |
Encoding, | |
EncodeInput, | |
InputSequence, | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Conditional random field | |
""" | |
import logging | |
import math | |
from typing import Dict, List, Optional, Tuple, Union | |
import torch | |
logger = logging.getLogger(__name__) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
def get_random_unicode(length): | |
""" 以下表を参考に恣意的な文字範囲選択を行っている | |
http://www.rikai.com/library/kanjitables/kanji_codes.unicode.shtml | |
""" | |
try: | |
get_char = unichr | |
except NameError: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from dataclasses import dataclass | |
from typing import Dict, Iterable, List, Tuple | |
@dataclass | |
class ChunkSpan: | |
start: int | |
end: int | |
label: str |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bisect import bisect | |
from more_itertools import flatten | |
def merge_overlapping_spans(spans): | |
spans = map(list, spans) | |
spans = sorted(spans, key=lambda x: x[0]) | |
merged = [spans[0]] | |
for current in spans: | |
previous = merged[-1] | |
if current[0] <= previous[1]: |