Skip to content

Instantly share code, notes, and snippets.

View kzinmr's full-sized avatar

Kazuki Inamura kzinmr

  • Tokyo, Japan
  • 10:49 (UTC +09:00)
View GitHub Profile
def ordered_uniq(seq):
    """Return the distinct items of *seq* as a list, in first-seen order.

    Args:
        seq: Any iterable of hashable items.

    Returns:
        A new list holding the first occurrence of each distinct item.

    Raises:
        TypeError: If an item is not hashable.
    """
    # dict keys keep insertion order, so the keys are exactly the first
    # occurrence of every distinct item; no side-effecting filter is needed.
    return list(dict.fromkeys(seq))
def ordered_uniq_unhashable(seq):
    """Return the distinct items of *seq* in first-seen order, for unhashable items.

    Two items count as duplicates when their ``str()`` forms are equal.
    The original objects are returned; nothing is rebuilt from its string
    form, so no ``eval`` is involved and items whose ``str()`` is not a
    valid Python expression are handled as well.

    Args:
        seq: Any iterable (items may be lists, dicts, or other unhashables).

    Returns:
        A new list with the first occurrence of each distinct item.
    """
    seen_keys = set()
    unique = []
    for item in seq:
        key = str(item)
        if key not in seen_keys:
            seen_keys.add(key)
            unique.append(item)
    return unique
# Character n-gram Jaccard similarity
from nltk import ngrams
from nltk.metrics import jaccard_distance
def ngram(sequence_list, n=3):
    """Return the joined grams of *sequence_list* for every size from 1 to n - 1.

    Each gram produced by ``ngrams`` is concatenated with ``''.join``, so the
    elements of *sequence_list* have to be strings. Grams are listed by
    increasing size.
    """
    # NOTE(review): range(1, n) stops at n - 1, so the default n=3 yields only
    # 1-grams and 2-grams. Confirm whether size n was meant to be included.
    joined = []
    for size in range(1, n):
        for gram in ngrams(sequence_list, size):
            joined.append(''.join(gram))
    return joined
def jaccard_similarity(s1, s2):
import matplotlib.pyplot as plt
import numpy as np
# x: 1-d array-like of scalars, defined before this snippet runs.
# Prints mean, standard deviation and median, then shows a 100-bin histogram
# with the mean marked in red and the median in yellow.
print(f'{np.mean(x):.03}, {np.std(x):.03}, {np.median(x)}')
plt.hist(x, bins=100)
# np.mean (not x.mean()) keeps this consistent with the other statistics and
# lets x be a plain list as well as an ndarray.
plt.axvline(np.mean(x), color='r')
plt.axvline(np.median(x), color='y')
plt.show()
from itertools import chain
def flatten(l):
    """Flatten one level of nesting: concatenate the sub-iterables of *l* into a list."""
    return [item for inner in l for item in inner]
def dict_merge(a, b):
    """Merge *b* into *a* in place and return *a*.

    Keys present in both take their value from *b*. The first argument is
    mutated; pass a copy if the original must stay untouched.

    Args:
        a: The dict that receives the entries (modified in place).
        b: The mapping (or iterable of key/value pairs) to merge in.

    Returns:
        The same object as *a*, after the update.
    """
    a.update(b)
    return a


dict_merge({'a': 1, 'b': 3}, {'c': 5})  # example: evaluates to {'a': 1, 'b': 3, 'c': 5}
def chunks(l, n):
    """Generate consecutive slices of *l*, each at most *n* items long.

    The last slice is shorter when ``len(l)`` is not a multiple of *n*.
    """
    yield from (l[begin:begin + n] for begin in range(0, len(l), n))
import math
def round_float(n, precision=3):
    """Round *n* up to *precision* decimal places.

    The value is scaled by ``10**precision``, passed through ``math.ceil``
    and scaled back, so the result is always rounded towards positive
    infinity rather than to the nearest value.

    Args:
        n: The number to round.
        precision: Number of decimal places to keep (default 3).

    Returns:
        The rounded value as a float.
    """
    factor = 10**precision
    # The original referenced an undefined name `v` and had an unbalanced
    # parenthesis; the parameter `n` is the value to scale.
    return math.ceil(n * factor) / factor
@kzinmr
kzinmr / transduce_jpnumber.py
Last active June 10, 2019 08:45
漢数字を数字に直す (revised from https://qiita.com/dosec/items/c6aef40fae6977fd89ab)
import re
# Translation table mapping kanji digits (including 〇 and the formal
# variants 壱/弐/参) to the corresponding ASCII digits.
tt_ksuji = str.maketrans('一二三四五六七八九〇壱弐参', '1234567890123')
# Matches a run of unit kanji (十拾百千万億兆) and/or digits.
re_suji = re.compile(r'[十拾百千万億兆\d]+')
# Matches one small-unit kanji (十, 拾, 百 or 千) or a run of digits.
re_kunit = re.compile(r'[十拾百千]|\d+')
# Matches one large-unit kanji (万, 億 or 兆) or a run of any other characters.
re_manshin = re.compile(r'[万億兆]|[^万億兆]+')
TRANSUNIT = {'十': 10,
'拾': 10,
import re
import unicodedata
# Splits an address string into five groups, each optional except the last:
#   1. prefecture: 京都府, 東京都, or the shortest prefix ending in 道/府/県
#   2. county:     text ending in 郡
#   3. city:       shortest text ending in 市/町/村
#   4. ward:       shortest text ending in 区
#   5. the remainder of the string
_address_re = re.compile(r'(京都府|東京都|.+?[道府県])?(.+郡)?(.+?[市町村])?(.+?区)?(.*)')
def transduce_jpaddress(addrstr):
addrstr_n = kansuji_to_arabic_numeral(addrstr)
m = _address_re.match(addrstr_n)
if m is not None:
(prefecture, county, city, ward, address) = m.groups()
address = address.strip()
@kzinmr
kzinmr / transduce_date.py
Created November 1, 2018 05:26
日付の構造化
import re
import unicodedata
from transduce_jpnumber import kansuji_to_arabic_numeral
# Matches an optional year / month / day sequence separated by '-', '/' or
# the kanji markers 年 / 月 / 日. Each component may be followed by a
# free-text suffix (ysuf / msuf / dsuf) that contains no digits, separators,
# date kanji or whitespace. Raw strings are used so that \d, \- and \s reach
# the regex engine without being invalid string escape sequences; the
# concatenated pattern is identical to the former single literal.
_date_re = re.compile(
    r'((?P<year>\d{4})[-/年](?P<ysuf>[^0-9\-/年月日\s]+)?)?'
    r'((?P<month>\d{1,2})[-/月](?P<msuf>[^0-9\-/年月日\s]+)?)?'
    r'((?P<date>\d{1,2})[日]?(?P<dsuf>[^0-9\-/年月日\s]+)?)?'
)
def transduce_jpdate(datestr):
datestr_n = kansuji_to_arabic_numeral(datestr)
datestr_n = unicodedata.normalize('NFKC', datestr_n)
m = _date_re.match(datestr_n.strip())
result = {}