This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def ordered_uniq(seq):
    """Return the distinct items of *seq* in first-seen order.

    Args:
        seq: Any iterable whose items are hashable.

    Returns:
        A new list holding each distinct item once, ordered by the
        position of its first occurrence in *seq*.

    Raises:
        TypeError: If an item is unhashable (it cannot be stored in a set).
    """
    seen = set()
    unique = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
def ordered_uniq_unhashable(seq):
    """Return the distinct items of *seq* in first-seen order.

    Works for unhashable items (lists, dicts, ...) by using ``repr(item)``
    as the deduplication key.  The original objects are returned, so no
    ``eval`` round-trip is needed: that round-trip executed arbitrary text
    and failed for items such as plain strings.

    Args:
        seq: Any iterable of items.

    Returns:
        A new list with the first occurrence of every distinct item.

    Note:
        Two items count as duplicates only when their ``repr`` strings are
        equal; e.g. dicts with the same content but a different insertion
        order are kept as separate items.
    """
    seen_keys = set()
    unique = []
    for item in seq:
        key = repr(item)
        if key not in seen_keys:
            seen_keys.add(key)
            unique.append(item)
    return unique
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# char-ngram jaccard sim | |
from nltk import ngrams | |
from nltk.metrics import jaccard_distance | |
def ngram(sequence_list, n=3):
    """Return the joined n-grams of *sequence_list* for sizes 1 .. n-1.

    Each n-gram produced by ``nltk.ngrams`` is concatenated with
    ``''.join``, so the items of *sequence_list* must be strings.

    Args:
        sequence_list: Sequence of strings (e.g. the characters of a word).
        n: Exclusive upper bound on the n-gram size.

    Returns:
        A list of strings: all 1-grams first, then 2-grams, ... up to
        (n-1)-grams.
    """
    # NOTE(review): range(1, n) stops at n - 1, so the default n=3 yields
    # only unigrams and bigrams -- confirm whether size n should be included.
    return [
        ''.join(grams)
        for size in range(1, n)
        for grams in ngrams(sequence_list, size)
    ]
def jaccard_similarity(s1, s2): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import numpy as np | |
# x: array of 1-d scalars (must already be defined when this snippet runs)
# Print mean, standard deviation and median of x, then show a 100-bin
# histogram with the mean (red line) and the median (yellow line) marked.
print(f'{np.mean(x):.03}, {np.std(x):.03}, {np.median(x)}')
plt.hist(x, bins=100)
# np.mean (rather than x.mean()) so plain Python sequences work as well.
plt.axvline(np.mean(x), color='r')
plt.axvline(np.median(x), color='y')
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import chain | |
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: An iterable of iterables.

    Returns:
        A new list with the items of every inner iterable, in order.
        Only the outermost level is flattened; deeper nesting is kept.
    """
    return list(chain.from_iterable(l))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def dict_merge(a, b):
    """Merge *b* into *a* in place and return *a*.

    Args:
        a: The dict to update; it is mutated.
        b: The mapping whose items are copied into *a*; on key collisions
            the value from *b* wins.

    Returns:
        The same object *a*, after the update.
    """
    a.update(b)
    return a


# Example: evaluates to {'a': 1, 'b': 3, 'c': 5}
dict_merge({'a': 1, 'b': 3}, {'c': 5})
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def chunks(l, n):
    """Yield successive n-sized chunks from l.

    Args:
        l: A sliceable sequence with a length (list, tuple, str, ...).
        n: Chunk size; must be a positive integer.

    Yields:
        Slices ``l[i:i + n]``; the last one may be shorter than *n*.

    Raises:
        ValueError: If *n* is 0 (``range`` rejects a zero step).
    """
    for start in range(0, len(l), n):
        yield l[start:start + n]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
def round_float(n, precision=3):
    """Round *n* up to *precision* decimal places.

    The value is scaled by ``10**precision``, passed through ``math.ceil``
    and scaled back, so the result is never smaller than *n*.

    Args:
        n: The number to round.
        precision: Number of decimal places to keep.

    Returns:
        The rounded-up value as a float.
    """
    factor = 10**precision
    # The original referenced an undefined name ``v`` here and carried an
    # unbalanced closing parenthesis; the value to round is the parameter n.
    return math.ceil(n * factor) / factor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Translation table: kanji digits (including 〇 and the formal 壱弐参)
# to the corresponding ASCII digits.
tt_ksuji = str.maketrans('一二三四五六七八九〇壱弐参', '1234567890123')
# A run of unit kanji and/or digits, i.e. one candidate number.
re_suji = re.compile(r'[十拾百千万億兆\d]+')
# Tokenizer: a single small-unit kanji (十拾百千) or a run of digits.
re_kunit = re.compile(r'[十拾百千]|\d+')
# Tokenizer: a single large-unit kanji (万億兆) or a run of anything else.
re_manshin = re.compile(r'[万億兆]|[^万億兆]+')
TRANSUNIT = {'十': 10, | |
'拾': 10, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import unicodedata | |
_address_re = re.compile(r'(京都府|東京都|.+?[道府県])?(.+郡)?(.+?[市町村])?(.+?区)?(.*)') | |
def transduce_jpaddress(addrstr): | |
addrstr_n = kansuji_to_arabic_numeral(addrstr) | |
m = _address_re.match(addrstr_n) | |
if m is not None: | |
(prefecture, county, city, ward, address) = m.groups() | |
address = address.strip() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import unicodedata | |
from transduce_jpnumber import kansuji_to_arabic_numeral | |
_date_re = re.compile('((?P<year>\d{4})[-/年](?P<ysuf>[^0-9\-/年月日\s]+)?)?((?P<month>\d{1,2})[-/月](?P<msuf>[^0-9\-/年月日\s]+)?)?((?P<date>\d{1,2})[日]?(?P<dsuf>[^0-9\-/年月日\s]+)?)?') | |
def transduce_jpdate(datestr): | |
datestr_n = kansuji_to_arabic_numeral(datestr) | |
datestr_n = unicodedata.normalize('NFKC', datestr_n) | |
m = _date_re.match(datestr_n.strip()) | |
result = {} |
OlderNewer