Skip to content

Instantly share code, notes, and snippets.

View kzinmr's full-sized avatar

Kazuki Inamura kzinmr

  • Tokyo, Japan
  • 01:58 (UTC +09:00)
View GitHub Profile
@kzinmr
kzinmr / extract_text_from_pdf.py
Last active July 4, 2019 09:26
Extract text from PDF files (exported from .ppt or .doc) using pdfminer.six (https://github.com/pdfminer/pdfminer.six)
import pickle
import subprocess
from pathlib import Path
import sys
from lxml import etree
def extract_textbox_slide(page):
"""
import contextlib
import time
@contextlib.contextmanager
def time_measure(ident):
    """Context manager that prints how long the enclosed block took.

    On exit, a line of the form ``"<ident>: <seconds> sec"`` is written to
    standard output.

    Args:
        ident: Label placed in front of the measurement in the printed line.

    Yields:
        None. The manager exists only for its timing side effect.
    """
    # perf_counter is monotonic, so the measurement cannot go negative or
    # jump if the system clock is adjusted while the block is running.
    tstart = time.perf_counter()
    try:
        yield
    finally:
        # Report even when the timed block raises; the exception itself
        # still propagates to the caller unchanged.
        elapsed = time.perf_counter() - tstart
        print(f"{ident}: {elapsed} sec")
@kzinmr
kzinmr / remove_output.py
Created June 3, 2019 06:00 — forked from damianavila/remove_output.py
Remove output from IPython notebook from the command line (dev version 1.0)
"""
Usage: python remove_output.py notebook.ipynb [ > without_output.ipynb ]
Modified from remove_output by Minrk
"""
import sys
import io
import os
from IPython.nbformat.current import read, write
{
'NAME':
{
'Name_Other': {},
'Person': {},
'God': {},
'Organization':
{
'Organization_Other': {},
'International_Organization': {},
# Translation table mapping every code point from 'ァ' through 'ヶ' to the
# hiragana code point at the same offset from 'ぁ'. Built once at import time.
_KATAKANA_TO_HIRAGANA = {
    code: code - ord('ァ') + ord('ぁ')
    for code in range(ord('ァ'), ord('ヶ') + 1)
}


def katakana_to_hiragana(string):
    """Return *string* with katakana characters replaced by hiragana.

    Only characters in the range 'ァ'..'ヶ' are converted; every other
    character (including the prolonged sound mark 'ー') is left unchanged.

    Args:
        string: Text to convert.

    Returns:
        A new string of the same length with the conversion applied.
    """
    # A single translate() pass replaces the per-character `+=` loop, which
    # builds a new string on every iteration.
    return string.translate(_KATAKANA_TO_HIRAGANA)
from collections import defaultdict
def dict_of_list(keys, values):
    """Group *values* by their positionally paired entry in *keys*.

    Args:
        keys: Sized sequence of hashable keys.
        values: Sized sequence of the same length as *keys*.

    Returns:
        A ``defaultdict(list)`` mapping each distinct key to the list of
        values paired with it, in input order.

    Raises:
        AssertionError: If *keys* and *values* differ in length.
    """
    # Raised explicitly instead of via `assert` so the check still runs under
    # `python -O`; without it zip() would silently drop the unpaired tail.
    # AssertionError is kept so callers see the same exception type as before.
    if len(keys) != len(values):
        raise AssertionError(
            f"keys and values must have the same length "
            f"({len(keys)} != {len(values)})"
        )
    key2values = defaultdict(list)
    for k, v in zip(keys, values):
        key2values[k].append(v)
    return key2values
@kzinmr
kzinmr / ngrams.py
Created December 14, 2018 03:02
pure python ngrams
from itertools import tee, zip_longest
def ngrams(iterable, n=3):
"""
>>> list(ngrams(range(5), 3))
[(0, 1, 2), (1, 2, 3), (2, 3, 4)]
"""
ts = tee(iterable, n)
for i, t in enumerate(ts[1:]):
import numpy as np
def calculate_cosine(query, X, k=10, threshold=0.5):
"""
Arguments:
query: (dim,) or (dim, 1)-array
X: (dim, N)-array of N-vectors
Returns:
topk_score_indices: sorted top-k-scored indices of N-vectors
topk_score: and its scores
from functools import reduce
def compose(*functions):
    """Compose single-argument callables, applied right to left.

    ``compose(f, g, h)(x)`` is ``f(g(h(x)))``. With no arguments the result
    is the identity function.

    Args:
        *functions: Callables that each take one argument.

    Returns:
        A function of one argument that runs the whole pipeline.
    """
    def composed(x):
        # Iterating keeps the call stack flat; a chain of nested lambdas
        # gets deeper with every function and can hit the recursion limit.
        for function in reversed(functions):
            x = function(x)
        return x

    return composed
@kzinmr
kzinmr / transduce_date.py
Created November 1, 2018 05:26
日付の構造化
import re
import unicodedata
from transduce_jpnumber import kansuji_to_arabic_numeral
# Matches an optional year, month and day in that order, each optionally
# followed by a free-text suffix captured in ysuf / msuf / dsuf. Raw strings
# keep `\d`, `\-` and `\s` as regex escapes rather than invalid Python string
# escapes; the compiled pattern is identical to the previous one.
_date_re = re.compile(
    r'((?P<year>\d{4})[-/年](?P<ysuf>[^0-9\-/年月日\s]+)?)?'
    r'((?P<month>\d{1,2})[-/月](?P<msuf>[^0-9\-/年月日\s]+)?)?'
    r'((?P<date>\d{1,2})[日]?(?P<dsuf>[^0-9\-/年月日\s]+)?)?'
)
def transduce_jpdate(datestr):
datestr_n = kansuji_to_arabic_numeral(datestr)
datestr_n = unicodedata.normalize('NFKC', datestr_n)
m = _date_re.match(datestr_n.strip())
result = {}