Skip to content

Instantly share code, notes, and snippets.

View kzinmr's full-sized avatar

Kazuki Inamura kzinmr

  • Tokyo, Japan
  • 18:18 (UTC +09:00)
View GitHub Profile
import regex
class YomiConverter:
KANA2ROM = {
"ア": "a",
"イ": "i",
"ウ": "u",
"エ": "e",
def get_fp_examples(df, pred_col_name='score', gold_col_name='answer'):
    """Return the false-positive rows of ``df``.

    A row is kept when its prediction column equals 1 while its gold
    column equals False. The shape of the selection is printed before it
    is returned.

    Args:
        df: Table holding one prediction column and one gold column.
        pred_col_name: Name of the prediction column (compared against 1).
        gold_col_name: Name of the gold-label column (compared against False).

    Returns:
        The subset of ``df`` whose rows satisfy both conditions, with the
        original index labels preserved.
    """
    predicted_positive = df[pred_col_name] == 1
    # Element-wise comparison on the column is intentional here; `not` / `is`
    # would operate on the whole column object instead of each row.
    gold_negative = df[gold_col_name] == False  # noqa: E712
    df_fp = df[predicted_positive & gold_negative]
    print(df_fp.shape)
    return df_fp
def get_fn_examples(df, pred_col_name='score', gold_col_name='answer'):
    """Return the false-negative rows of ``df``.

    A row is kept when its prediction column equals 0 while its gold
    column equals True. The shape of the selection is printed before it
    is returned.

    Args:
        df: Table holding one prediction column and one gold column.
        pred_col_name: Name of the prediction column (compared against 0).
        gold_col_name: Name of the gold-label column (compared against True).

    Returns:
        The subset of ``df`` whose rows satisfy both conditions, with the
        original index labels preserved.
    """
    predicted_negative = df[pred_col_name] == 0
    # Element-wise comparison on the column is intentional here; a plain
    # truthiness test would operate on the whole column object.
    gold_positive = df[gold_col_name] == True  # noqa: E712
    df_fn = df[predicted_negative & gold_positive]
    print(df_fn.shape)
    return df_fn
from sklearn.metrics import confusion_matrix
from itertools import product
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
def plot_confusion_matrix(cm, classes,
normalize=False,
def detect_corr(df):
    """Rank column pairs of ``df`` by absolute Pearson correlation.

    The correlation matrix is flattened into a Series indexed by
    (column, column) pairs and sorted from strongest to weakest.

    Self-pairs (a column with itself) are set to 0 so they do not crowd
    the top of the ranking. Only self-pairs are zeroed: two *different*
    columns that are perfectly correlated keep their value of 1 and
    therefore rank first. Pairs whose correlation is NaN are placed at the
    end of the result.

    Args:
        df: DataFrame whose columns are to be compared.

    Returns:
        Series of absolute correlations indexed by column pairs, in
        descending order. Each unordered pair appears twice, once per
        orientation.
    """
    abs_corr = df.corr(method='pearson').abs()
    pairs = abs_corr.unstack()
    if pairs.empty:
        return pairs

    # Identify self-pairs by label rather than by value, so that genuine
    # perfect correlations between distinct columns are not discarded.
    first = pairs.index.get_level_values(0)
    second = pairs.index.get_level_values(1)
    pairs = pairs.mask(first == second, 0)

    # Sorting descending directly (instead of ascending + reversal) keeps NaN
    # entries at the tail rather than flipping them to the head.
    return pairs.sort_values(ascending=False, kind="quicksort")
@kzinmr
kzinmr / split_text_and_spans.py
Last active October 8, 2020 08:17
split_text_and_spans for NER
def get_block_index_offset(index, step):
    """Locate ``index`` within consecutive blocks of length ``step``.

    Args:
        index: Absolute position to convert.
        step: Block length; must be greater than 0.

    Returns:
        A ``(block_index, offset)`` tuple of ints such that, for integer
        inputs, ``block_index * step + offset == index`` and
        ``0 <= offset < step``.

    Raises:
        AssertionError: If ``step`` is not positive.
    """
    assert step > 0, "step must be positive"
    # divmod stays in integer arithmetic for integer inputs; true division
    # would round through float and lose precision for very large indexes.
    block_index, offset = divmod(index, step)
    return int(block_index), int(offset)
def split_text_and_spans(text, spans, tagtypes, step):
"""
DLに噛ますのに最大入力帳単位に文書を切る。スパン情報も分割にあわせて修正する。
その際アノテーションが2ブロックに渡るケースがあるが、
@kzinmr
kzinmr / address_matcher.py
Last active September 23, 2020 13:01
AddressMatcher for the Japanese addressing system, using https://geolonia.github.io/japanese-addresses/
import regex as re
import pandas as pd
class AddressMatcher:
def __init__(self):
self.pref_city_addr_matcher = self.geolonia_dataset()
@staticmethod
def geolonia_dataset():
import xml.etree.ElementTree as ET
class XmlListConfig(list):
def __init__(self, aList):
for element in aList:
if element:
# treat like dict
if len(element) == 1 or element[0].tag != element[1].tag:
self.append(XmlDictConfig(element))
%%time
import xmltodict
import unicodedata
import requests
import datetime
def download_leidata():
dt = datetime.datetime.today().strftime('%Y%m%d')
filestem = f'JPX-LEI-JPXFullFile_v2_{dt}'
import pandas as pd
def jpx_companies():
url = 'https://www.jpx.co.jp/markets/statistics-equities/misc/tvdivq0000001vg2-att/data_j.xls'
df = pd.read_excel(url)
# print(df.shape, df.columns)
excludes = {'ETF・ETN', 'REIT・ベンチャーファンド・カントリーファンド・インフラファンド'}
df_companies = df.loc[~df['市場・商品区分'].isin(excludes)]
# companies_set = {c for c, df_ in df_companies.set_index('銘柄名').groupby(level=0)}
from bisect import bisect
from more_itertools import flatten
def merge_overlapping_spans(spans):
spans = map(list, spans)
spans = sorted(spans, key=lambda x: x[0])
merged = [spans[0]]
for current in spans:
previous = merged[-1]
if current[0] <= previous[1]: