kzinmr’s gists

kzinmr / jpx_companies.py

Created September 18, 2020 08:41

	import pandas as pd


	def jpx_companies():
	url = 'https://www.jpx.co.jp/markets/statistics-equities/misc/tvdivq0000001vg2-att/data_j.xls'
	df = pd.read_excel(url)
	# print(df.shape, df.columns)
	excludes = {'ETF・ETN', 'REIT・ベンチャーファンド・カントリーファンド・インフラファンド'}
	df_companies = df.loc[~df['市場・商品区分'].isin(excludes)]
	# companies_set = {c for c, df_ in df_companies.set_index('銘柄名').groupby(level=0)}

kzinmr / companies_with_lei.py

Created September 18, 2020 08:18

https://www.lei.jpx.co.jp/lei/full_file.html

	%%time

	import xmltodict
	import unicodedata
	import requests
	import datetime

	def download_leidata():
	dt = datetime.datetime.today().strftime('%Y%m%d')
	filestem = f'JPX-LEI-JPXFullFile_v2_{dt}'

kzinmr / xml2dict.py

Created September 18, 2020 06:52

	import xml.etree.ElementTree as ET


	class XmlListConfig(list):
	def __init__(self, aList):
	for element in aList:
	if element:
	# treat like dict
	if len(element) == 1 or element[0].tag != element[1].tag:
	self.append(XmlDictConfig(element))

kzinmr / address_matcher.py

Last active September 23, 2020 13:01

AddressMatcher for the Japanese addressing system, using https://geolonia.github.io/japanese-addresses/

	import regex as re
	import pandas as pd

	class AddressMatcher:

	def __init__(self):
	self.pref_city_addr_matcher = self.geolonia_dataset()

	@staticmethod
	def geolonia_dataset():

kzinmr / split_text_and_spans.py

Last active October 8, 2020 08:17

split_text_and_spans for NER

	def get_block_index_offset(index, step):
	    """Split a flat position into a block number and an offset inside that block.

	    Args:
	        index: Position in the unsplit sequence; expected to be an integer.
	        step: Block length; must be positive.

	    Returns:
	        A ``(block_index, offset)`` tuple of ints. For integer inputs,
	        ``block_index * step + offset == index``.

	    Raises:
	        AssertionError: If ``step`` is not positive.
	    """
	    assert step > 0
	    offset = int(index % step)
	    # For integer inputs ``index - offset`` is an exact multiple of ``step``,
	    # so floor division yields the exact quotient. True division would round
	    # through a float and return a wrong block index for very large integers.
	    block_index = int((index - offset) // step)
	    return block_index, offset

	def split_text_and_spans(text, spans, tagtypes, step):
	"""
	DLに噛ますのに最大入力帳単位に文書を切る。スパン情報も分割にあわせて修正する。
	その際アノテーションが2ブロックに渡るケースがあるが、

kzinmr / detect_correlation.py

Created August 3, 2020 10:16

	def detect_corr(df):
	    """Rank column pairs of ``df`` by absolute Pearson correlation.

	    Args:
	        df: DataFrame whose columns are correlated pairwise via ``df.corr``.

	    Returns:
	        A Series indexed by ``(column, column)`` pairs holding the absolute
	        correlation, sorted from largest to smallest. The pairing of a
	        column with itself is set to 0 so it does not dominate the ranking.
	        Both ``(a, b)`` and ``(b, a)`` appear in the result.
	    """
	    abs_corr = df.corr(method='pearson').abs()
	    pairs = abs_corr.unstack()
	    # Zero only the self-pairs (same label on both index levels). Filtering
	    # on "value == 1" would also wipe out distinct columns that are perfectly
	    # correlated, which are exactly the pairs this function should surface.
	    left = pairs.index.get_level_values(0)
	    right = pairs.index.get_level_values(1)
	    pairs = pairs.mask(left == right, 0)
	    # Sort ascending and reverse, so ties and NaN placement keep the same
	    # ordering as a reversed ascending sort.
	    return pairs.sort_values(kind="quicksort").iloc[::-1]

kzinmr / ner_plot_confusion_matrix.py

Created July 7, 2020 10:42

	from sklearn.metrics import confusion_matrix
	from itertools import product
	import matplotlib.pyplot as plt

	from matplotlib.pyplot import figure
	figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')


	def plot_confusion_matrix(cm, classes,
	normalize=False,

kzinmr / df_fpfn.py

Created June 9, 2020 12:09

	def get_fp_examples(df, pred_col_name='score', gold_col_name='answer'):
	    """Return the false-positive rows of ``df`` and print the selection's shape.

	    A row is selected when its prediction column equals 1 while its gold
	    column equals False.

	    Args:
	        df: DataFrame holding predictions and gold labels.
	        pred_col_name: Name of the prediction column.
	        gold_col_name: Name of the gold-label column.

	    Returns:
	        The subset of ``df`` matching both conditions.
	    """
	    predicted_positive = df[pred_col_name].eq(1)
	    gold_negative = df[gold_col_name].eq(False)
	    false_positives = df[predicted_positive & gold_negative]
	    print(false_positives.shape)
	    return false_positives

	def get_fn_examples(df, pred_col_name='score', gold_col_name='answer'):
	    """Return the false-negative rows of ``df`` and print the selection's shape.

	    A row is selected when its prediction column equals 0 while its gold
	    column equals True.

	    Args:
	        df: DataFrame holding predictions and gold labels.
	        pred_col_name: Name of the prediction column.
	        gold_col_name: Name of the gold-label column.

	    Returns:
	        The subset of ``df`` matching both conditions.
	    """
	    predicted_negative = df[pred_col_name].eq(0)
	    gold_positive = df[gold_col_name].eq(True)
	    false_negatives = df[predicted_negative & gold_positive]
	    print(false_negatives.shape)
	    return false_negatives

kzinmr / yomi_converter.py

Created June 3, 2020 12:44

kzinmr / fuzzy_match_spans.py

Created May 21, 2020 10:57

fuzzy matching using https://pypi.org/project/regex/

	import regex
	from typing import List, Tuple


	def fuzzy_match_spans(annotation: str, text: str, max_num_errors: int = 5) -> List[Tuple[int, int]]:
	r = regex.compile('(%s)' % re.escape(annotation))
	matches = [m.span() for m in r.finditer(text)]
	num_errors = 1
	while not matches and num_errors <= max_num_errors:
	r = regex.compile('(%s){e<=%d}' % (re.escape(annotation), num_errors))

Kazuki Inamura kzinmr