Skip to content

Instantly share code, notes, and snippets.

View kzinmr's full-sized avatar

Kazuki Inamura kzinmr

  • Tokyo, Japan
  • 10:51 (UTC +09:00)
View GitHub Profile
@kzinmr
kzinmr / extractdocx.py
Created July 16, 2019 03:52 — forked from etienned/extractdocx.py
Simple function to extract text from an MS XML Word document (.docx) without any dependencies.
try:
from xml.etree.cElementTree import XML
except ImportError:
from xml.etree.ElementTree import XML
import zipfile
"""
Module that extracts text from an MS XML Word document (.docx).
(Inspired by python-docx <https://github.com/mikemaccana/python-docx>)
@kzinmr
kzinmr / pmi.py
Last active September 10, 2024 10:07
PMI calculation
"calculate PMI(A,B)=P(A,B)/P(A)P(B) for every token A and B in a window"
from itertools import tee, combinations
from collections import Counter
def count_bigram(sentence, window=5):
# ['A','B','C','D', 'E', 'F', 'G'], 4 ->
# [['A', 'B', 'C', 'D'],
# ['B', 'C', 'D', 'E'],
# ['C', 'D', 'E', 'F'],
from collections import defaultdict
from functools import reduce, partial
import numpy as np
from itertools import chain
def flatten(l):
    """Concatenate the items of every iterable in ``l`` into one flat list.

    Only one level of nesting is removed: each element of ``l`` must itself
    be iterable, and its items are copied to the result in order.

    >>> flatten([[1, 2], [3], []])
    [1, 2, 3]
    """
    return list(chain.from_iterable(l))
@kzinmr
kzinmr / cluster_df.py
Last active September 10, 2024 10:05
from collections import defaultdict, Counter
from operator import add
from functools import reduce
import numpy as np
from sklearn.cluster import KMeans
def dict_of_list(keys, values):
assert(len(keys) == len(values))
from collections import OrderedDict
from html.parser import HTMLParser
import json
class MyHTMLParser(HTMLParser):
debug = False
def __init__(self):
@kzinmr
kzinmr / bilm.py
Created May 14, 2020 04:35
A simple bidirectional language model with nltk
from typing import List, Dict, Set, Optional
from nltk.lm import MLE
from nltk.util import ngrams
class InvalidOrderException(Exception):
    """Error type signalling an invalid order value.

    NOTE(review): the code that raises this is not visible here; presumably
    "order" is the n-gram order of the language model -- confirm.
    """
class InvalidContextSizeException(Exception):
    """Error type signalling an invalid context size.

    NOTE(review): the code that raises this is not visible here; presumably
    the context length is checked against the model order -- confirm.
    """
@kzinmr
kzinmr / window_cooccurence.py
Last active May 19, 2020 01:52
Generate window contexts and count co-occurrence within them.
from typing import List
from itertools import tee, combinations
from collections import Counter
# def count_cooccurrence_in_window(context_window, delimiter=' '):
# return Counter([delimiter.join(bi) for bi in combinations(context_window, 2)])
def window_cooccurrence(sentence: List[str], window: int = 5) -> Counter:
from itertools import groupby
from operator import itemgetter

data = [1, 4, 5, 6, 10, 15, 16, 17, 18, 22, 25, 26, 27, 28]

# Split sorted integers into runs of consecutive values.
# Within a run, (index - value) is constant, so grouping the enumerated
# (index, value) pairs by that difference yields one group per run.
# The key function takes the pair as a single argument because Python 3
# removed tuple-unpacking parameters (``lambda (i, x): ...``), and each
# group is materialised with list() because map() is lazy in Python 3 and
# groupby invalidates a group once iteration advances to the next one.
consecutive_ints = [
    list(map(itemgetter(1), run))
    for _, run in groupby(enumerate(data), key=lambda pair: pair[0] - pair[1])
]
# [[1],
# [4, 5, 6],
# [10],
# [15, 16, 17, 18],
# [22],
# [25, 26, 27, 28]]
import regex
from typing import List, Tuple
def fuzzy_match_spans(annotation: str, text: str, max_num_errors: int = 5) -> List[Tuple[int, int]]:
r = regex.compile('(%s)' % re.escape(annotation))
matches = [m.span() for m in r.finditer(text)]
num_errors = 1
while not matches and num_errors <= max_num_errors:
r = regex.compile('(%s){e<=%d}' % (re.escape(annotation), num_errors))