This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def jpx_companies(): | |
url = 'https://www.jpx.co.jp/markets/statistics-equities/misc/tvdivq0000001vg2-att/data_j.xls' | |
df = pd.read_excel(url) | |
# print(df.shape, df.columns) | |
excludes = {'ETF・ETN', 'REIT・ベンチャーファンド・カントリーファンド・インフラファンド'} | |
df_companies = df.loc[~df['市場・商品区分'].isin(excludes)] | |
# companies_set = {c for c, df_ in df_companies.set_index('銘柄名').groupby(level=0)} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%%time | |
import xmltodict | |
import unicodedata | |
import requests | |
import datetime | |
def download_leidata(): | |
dt = datetime.datetime.today().strftime('%Y%m%d') | |
filestem = f'JPX-LEI-JPXFullFile_v2_{dt}' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.ElementTree as ET | |
class XmlListConfig(list): | |
def __init__(self, aList): | |
for element in aList: | |
if element: | |
# treat like dict | |
if len(element) == 1 or element[0].tag != element[1].tag: | |
self.append(XmlDictConfig(element)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import regex as re | |
import pandas as pd | |
class AddressMatcher: | |
def __init__(self): | |
self.pref_city_addr_matcher = self.geolonia_dataset() | |
@staticmethod | |
def geolonia_dataset(): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_block_index_offset(index, step):
    """Split a flat position into a block number and an offset inside it.

    Blocks are consecutive runs of ``step`` positions, so for integer inputs
    ``index == block_index * step + offset`` with ``0 <= offset < step``.

    Args:
        index: Flat position to convert.
        step: Block length; must be greater than zero.

    Returns:
        A ``(block_index, offset)`` tuple of ``int`` values.

    Raises:
        AssertionError: If ``step`` is not greater than zero.
    """
    # Raised explicitly instead of via ``assert`` so the guard is not stripped
    # under ``python -O``; the exception type is unchanged for existing callers.
    if not step > 0:
        raise AssertionError("step must be greater than zero")
    # divmod stays in integer arithmetic for integer inputs. Going through
    # true division would round the quotient once it no longer fits a float
    # exactly (beyond 2**53).
    block_index, offset = divmod(index, step)
    return int(block_index), int(offset)
def split_text_and_spans(text, spans, tagtypes, step): | |
""" | |
DLに噛ますのに最大入力帳単位に文書を切る。スパン情報も分割にあわせて修正する。 | |
その際アノテーションが2ブロックに渡るケースがあるが、 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def detect_corr(df):
    """Rank column pairs of ``df`` by absolute Pearson correlation.

    The absolute correlation matrix is unstacked into a Series with one entry
    per ordered column pair. Every entry equal to exactly 1 is replaced by 0,
    which removes each column's correlation with itself (and any other pair
    whose value is exactly 1). The Series is then ordered from the largest
    value to the smallest.

    Args:
        df: DataFrame whose columns are correlated against each other.

    Returns:
        A Series of absolute correlations sorted in descending order.
    """

    def _zero_if_one(value):
        # Entries equal to exactly 1 are swapped for 0; all others pass through.
        if value == 1:
            return 0
        return value

    abs_corr = df.corr(method='pearson').abs()
    pairwise = abs_corr.unstack().apply(_zero_if_one)
    ascending = pairwise.sort_values(kind="quicksort")
    # Reverse the ascending sort so the strongest correlations come first.
    return ascending[::-1]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.metrics import confusion_matrix | |
from itertools import product | |
import matplotlib.pyplot as plt | |
from matplotlib.pyplot import figure | |
# Create an 8x6 inch figure at 80 dpi with a white face and a black edge.
plt.figure(
    num=None,
    figsize=(8, 6),
    dpi=80,
    facecolor='w',
    edgecolor='k',
)
def plot_confusion_matrix(cm, classes, | |
normalize=False, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_fp_examples(df, pred_col_name='score', gold_col_name='answer'):
    """Select the false-positive rows of ``df``.

    A row is kept when its prediction column equals 1 while its gold column
    equals ``False``. The shape of the selection is printed before it is
    returned.

    Args:
        df: DataFrame holding the prediction and gold columns.
        pred_col_name: Name of the prediction column.
        gold_col_name: Name of the gold-label column.

    Returns:
        The rows of ``df`` that satisfy both conditions.
    """
    predicted_positive = df[pred_col_name].eq(1)
    gold_negative = df[gold_col_name].eq(False)
    false_positives = df.loc[predicted_positive & gold_negative]
    print(false_positives.shape)
    return false_positives
def get_fn_examples(df, pred_col_name='score', gold_col_name='answer'):
    """Select the false-negative rows of ``df``.

    A row is kept when its prediction column equals 0 while its gold column
    equals ``True``. The shape of the selection is printed before it is
    returned.

    Args:
        df: DataFrame holding the prediction and gold columns.
        pred_col_name: Name of the prediction column.
        gold_col_name: Name of the gold-label column.

    Returns:
        The rows of ``df`` that satisfy both conditions.
    """
    predicted_negative = df[pred_col_name].eq(0)
    gold_positive = df[gold_col_name].eq(True)
    false_negatives = df.loc[predicted_negative & gold_positive]
    print(false_negatives.shape)
    return false_negatives
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import regex | |
class YomiConverter: | |
KANA2ROM = { | |
"ア": "a", | |
"イ": "i", | |
"ウ": "u", | |
"エ": "e", |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import regex | |
from typing import List, Tuple | |
def fuzzy_match_spans(annotation: str, text: str, max_num_errors: int = 5) -> List[Tuple[int, int]]: | |
r = regex.compile('(%s)' % re.escape(annotation)) | |
matches = [m.span() for m in r.finditer(text)] | |
num_errors = 1 | |
while not matches and num_errors <= max_num_errors: | |
r = regex.compile('(%s){e<=%d}' % (re.escape(annotation), num_errors)) |