Skip to content

Instantly share code, notes, and snippets.

View kzinmr's full-sized avatar

Kazuki Inamura kzinmr

  • Tokyo, Japan
  • 15:59 (UTC +09:00)
View GitHub Profile
# 会社名サンプラー
import os
import pickle
import random
from dataclasses import dataclass
from typing import List
import numpy as np
def seed_everything(seed: int = 1234):
import torch
class WordDropout(torch.nn.Module):
"""
Implementation of word dropout. Randomly drops out entire words (or characters) in embedding space.
"""
def __init__(self, dropout_rate=0.05, inplace=False):
super(WordDropout, self).__init__()
self.dropout_rate = dropout_rate
@kzinmr
kzinmr / tokenize_and_aligned.py
Last active January 15, 2021 12:03
NER dataset processing with huggingface tokenizers==0.9.4 / transformers==4.2.1
import logging
import os
import unicodedata
from dataclasses import dataclass
from enum import Enum
from itertools import product, starmap
from pathlib import Path
from typing import Dict, List, Optional, Union
import MeCab
import os
import shutil
import sys
import zipfile
def copy_docs(zip_path, p):
try:
with zipfile.ZipFile(zip_path) as z:
targets = [fn for fn in z.namelist() if fn.endswith('doc') or fn.endswith('docx')]
@kzinmr
kzinmr / mecab_pretokenizer.py
Last active January 5, 2021 07:52
Fast/Slow tokenizers with huggingface/tokenizers
import unicodedata
from typing import Dict, List, Optional, Union
import MeCab
from tokenizers import (
AddedToken,
BertWordPieceTokenizer,
Encoding,
EncodeInput,
InputSequence,
)
"""
Conditional random field
"""
import logging
import math
from typing import Dict, List, Optional, Tuple, Union
import torch
logger = logging.getLogger(__name__)
import random
def get_random_unicode(length):
""" 以下表を参考に恣意的な文字範囲選択を行っている
http://www.rikai.com/library/kanjitables/kanji_codes.unicode.shtml
"""
try:
get_char = unichr
except NameError:
from dataclasses import dataclass
from typing import Dict, Iterable, List, Tuple
@dataclass
class ChunkSpan:
    """A labeled span over a sequence, identified by start/end offsets.

    NOTE(review): whether `end` is inclusive or exclusive is not visible
    here — confirm against the code that constructs these spans. Offsets
    are presumably character or token indices; verify with the caller.
    """

    # Start offset of the span.
    start: int
    # End offset of the span (inclusive vs. exclusive — TODO confirm).
    end: int
    # Label attached to the span (presumably an entity tag in an
    # NER-style pipeline — verify against surrounding code).
    label: str
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
from bisect import bisect
from more_itertools import flatten
def merge_overlapping_spans(spans):
spans = map(list, spans)
spans = sorted(spans, key=lambda x: x[0])
merged = [spans[0]]
for current in spans:
previous = merged[-1]
if current[0] <= previous[1]: