Skip to content

Instantly share code, notes, and snippets.

View kzinmr's full-sized avatar

Kazuki Inamura kzinmr

  • Tokyo, Japan
  • 05:36 (UTC +09:00)
View GitHub Profile
import re
import unicodedata
# Pattern that splits an address string into five capture groups, in order.
# The first four groups are optional; the last may be empty:
#   1. prefecture: 京都府, 東京都, or the shortest prefix ending in 道/府/県
#      (東京都 must be listed explicitly because 都 is not in that class)
#   2. county: a run of characters ending in 郡 (greedy)
#   3. city/town/village: the shortest run ending in 市/町/村
#   4. ward: the shortest run ending in 区
#   5. remainder: whatever text is left
_address_re = re.compile(r'(京都府|東京都|.+?[道府県])?(.+郡)?(.+?[市町村])?(.+?区)?(.*)')
def transduce_jpaddress(addrstr):
addrstr_n = kansuji_to_arabic_numeral(addrstr)
m = _address_re.match(addrstr_n)
if m is not None:
(prefecture, county, city, ward, address) = m.groups()
address = address.strip()
@kzinmr
kzinmr / transduce_jpnumber.py
Last active June 10, 2019 08:45
漢数字を数字に直す (revised from https://qiita.com/dosec/items/c6aef40fae6977fd89ab)
import re
# Translation table mapping single kanji digits to ASCII digits:
# 一..九 -> 1..9, 〇 -> 0, and the formal variants 壱/弐/参 -> 1/2/3.
tt_ksuji = str.maketrans('一二三四五六七八九〇壱弐参', '1234567890123')
# One or more consecutive characters that are unit kanji (十拾百千万億兆) or digits.
re_suji = re.compile(r'[十拾百千万億兆\d]+')
# Either a single small-unit kanji (十/拾/百/千) or a run of digits.
re_kunit = re.compile(r'[十拾百千]|\d+')
# Either a single large-unit kanji (万/億/兆) or a run of any other characters.
re_manshin = re.compile(r'[万億兆]|[^万億兆]+')
TRANSUNIT = {'十': 10,
'拾': 10,
import math
def round_float(n, precision=3):
    """Round *n* up (towards positive infinity) to *precision* decimal places.

    The value is scaled by ``10 ** precision``, passed through ``math.ceil``
    and scaled back, so the result is never smaller than *n*.

    Args:
        n: The number to round.
        precision: Number of decimal places to keep (default 3).

    Returns:
        The rounded-up value as a float.
    """
    factor = 10 ** precision
    # The original referenced an undefined name ``v`` and had an unbalanced
    # closing parenthesis; the value being rounded is the parameter ``n``.
    return math.ceil(n * factor) / factor
def chunks(l, n):
    """Lazily split *l* into consecutive slices of at most *n* items.

    Every slice has exactly *n* items except possibly the last one, which
    holds whatever remains. *l* must support ``len()`` and slicing.
    """
    starts = range(0, len(l), n)
    yield from (l[begin:begin + n] for begin in starts)
def dict_merge(a, b):
    """Merge *b* into *a* in place and return *a*.

    Keys present in both mappings take the value from *b*. Note that *a*
    itself is mutated; pass a copy if the original must stay untouched.

    Args:
        a: The dictionary that receives the entries (modified in place).
        b: The mapping whose entries are copied into *a*.

    Returns:
        The same object as *a*, after the update.
    """
    a.update(b)
    return a


# Example usage: evaluates to {'a': 1, 'b': 3, 'c': 5}.
dict_merge({'a': 1, 'b': 3}, {'c': 5})
from itertools import chain
def flatten(l):
    """Flatten one level of nesting: concatenate the items of each element of *l*.

    Only the outermost level is unwrapped; deeper nesting is left as is.
    Always returns a new list.
    """
    return [item for inner in l for item in inner]
import matplotlib.pyplot as plt
import numpy as np
# Print summary statistics for a sample and plot its histogram.
# x: array of 1-d scalars. It is not defined in this snippet; the x.mean()
# call below needs an object with a .mean() method, presumably a NumPy
# array (TODO confirm against the caller).
# Mean and standard deviation are printed with 3 significant digits; the
# median is printed unformatted.
print(f'{np.mean(x):.03}, {np.std(x):.03}, {np.median(x)}')
# Histogram of the values using 100 bins.
plt.hist(x, bins=100)
# Vertical marker for the mean (red).
plt.axvline(x.mean(), color='r')
# Vertical marker for the median (yellow).
plt.axvline(np.median(x), color='y')
plt.show()
# char-ngram jaccard sim
from nltk import ngrams
from nltk.metrics import jaccard_distance
def ngram(sequence_list, n=3):
    """Return the joined n-grams of *sequence_list* for sizes 1 through n - 1.

    For each size, ``ngrams`` (imported from nltk) yields groups of items
    which are concatenated with ``''.join``. Results are ordered by size
    first, then by position in the sequence.

    NOTE(review): ``range(1, n)`` stops at ``n - 1``, so the default ``n=3``
    produces only 1-grams and 2-grams. Confirm whether size ``n`` itself was
    meant to be included.
    """
    joined = []
    for size in range(1, n):
        for grams in ngrams(sequence_list, size):
            joined.append(''.join(grams))
    return joined
def jaccard_similarity(s1, s2):
def ordered_uniq(seq):
    """Return the distinct items of *seq* as a list, in first-seen order.

    Items must be hashable, since a set tracks what has already been emitted.
    """
    already_seen = set()
    unique = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique.append(item)
    return unique
def ordered_uniq_unhashable(seq):
    """Return the distinct items of *seq* in first-seen order, for unhashable items.

    Two items count as duplicates when their ``str()`` representations are
    equal, which lets lists, dicts and other unhashable values be compared.

    The previous implementation rebuilt every element with
    ``eval(str(item))``. That executes arbitrary text as code and raises for
    any element whose string form is not a valid Python expression. This
    version keeps the same string-based notion of equality but returns the
    original objects (the first occurrence of each), so nothing is evaluated.

    Args:
        seq: Iterable of items; they need not be hashable.

    Returns:
        A new list holding the first occurrence of each distinct item. The
        elements are the same objects as in *seq*, not copies.
    """
    seen_keys = set()
    unique = []
    for item in seq:
        key = str(item)
        if key not in seen_keys:
            seen_keys.add(key)
            unique.append(item)
    return unique