kzinmr’s gists

kzinmr / extractdocx.py

Created July 16, 2019 03:52 — forked from etienned/extractdocx.py

Simple function to extract text from an MS XML Word document (.docx) without any dependencies.

	try:
	from xml.etree.cElementTree import XML
	except ImportError:
	from xml.etree.ElementTree import XML
	import zipfile


	"""
	Module that extracts text from an MS XML Word document (.docx).
	(Inspired by python-docx <https://github.com/mikemaccana/python-docx>)

kzinmr / pmi.py

Last active September 10, 2024 10:07

PMI calculation

	"calculate PMI(A,B)=P(A,B)/P(A)P(B) for every token A and B in a window"
	from itertools import tee, combinations
	from collections import Counter


	def count_bigram(sentence, window=5):
	# ['A','B','C','D', 'E', 'F', 'G'], 4 ->
	# [['A', 'B', 'C', 'D'],
	# ['B', 'C', 'D', 'E'],
	# ['C', 'D', 'E', 'F'],

kzinmr / gist:80d69fe2aff6113d70f75a172b57233f

Created January 16, 2020 07:51

a

kzinmr / calculate_idf_score.py

Last active September 10, 2024 10:06


	from collections import defaultdict
	from functools import reduce, partial
	import numpy as np
	from itertools import chain


	def flatten(l):
	    """Flatten one level of nesting.

	    Concatenates the items of every iterable contained in ``l`` into a
	    single new list, preserving order.
	    """
	    return [item for group in l for item in group]

kzinmr / cluster_df.py

Last active September 10, 2024 10:05

	from collections import defaultdict, Counter
	from operator import add
	from functools import reduce
	import numpy as np

	from sklearn.cluster import KMeans


	def dict_of_list(keys, values):
	assert(len(keys) == len(values))

kzinmr / shokugyo_html2json.py

Last active April 26, 2020 12:52

	from collections import OrderedDict
	from html.parser import HTMLParser
	import json


	class MyHTMLParser(HTMLParser):

	debug = False

	def __init__(self):

kzinmr / bilm.py

Created May 14, 2020 04:35

A simple bidirectional language model with nltk

	from typing import List, Dict, Set, Optional
	from nltk.lm import MLE
	from nltk.util import ngrams


	class InvalidOrderException(Exception):
	    """Error type for an invalid order value.

	    NOTE(review): presumably refers to the n-gram order of the language
	    model; the raise sites are outside this view -- confirm.
	    """

	class InvalidContextSizeException(Exception):
	    """Error type for an invalid context size.

	    NOTE(review): presumably raised when a supplied context has the wrong
	    length for the model; the raise sites are outside this view -- confirm.
	    """

kzinmr / window_cooccurence.py

Last active May 19, 2020 01:52

Generate window contexts and count cooccurrence within them.

	from typing import List
	from itertools import tee, combinations
	from collections import Counter


	# def count_cooccurrence_in_window(context_window, delimiter=' '):
	# return Counter([delimiter.join(bi) for bi in combinations(context_window, 2)])


	def window_cooccurrence(sentence: List[str], window: int = 5) -> Counter:

kzinmr / consecutive_integers.py

Created May 18, 2020 00:25

	from itertools import groupby
	from operator import itemgetter
	# Sample input: ascending integers containing several runs of consecutive values.
	data = [1, 4, 5, 6, 10, 15, 16, 17, 18, 22, 25, 26, 27, 28]
	# Within a run of consecutive integers, (index - value) stays constant, so it
	# is used as the groupby key. The key function indexes the (index, value)
	# pair because tuple parameters (``lambda (i, x): ...``) are a syntax error
	# in Python 3. Each group is materialised with list() because groupby
	# invalidates a group's iterator as soon as it advances to the next group.
	consecutive_ints = [
	    list(map(itemgetter(1), group))
	    for _, group in groupby(enumerate(data), lambda pair: pair[0] - pair[1])
	]
	# [[1],
	# [4, 5, 6],
	# [10],
	# [15, 16, 17, 18],
	# [22],
	# [25, 26, 27, 28]]

kzinmr / fuzzy_match_spans.py

Created May 21, 2020 10:57

fuzzy matching using https://pypi.org/project/regex/

	import regex
	from typing import List, Tuple


	def fuzzy_match_spans(annotation: str, text: str, max_num_errors: int = 5) -> List[Tuple[int, int]]:
	r = regex.compile('(%s)' % re.escape(annotation))
	matches = [m.span() for m in r.finditer(text)]
	num_errors = 1
	while not matches and num_errors <= max_num_errors:
	r = regex.compile('(%s){e<=%d}' % (re.escape(annotation), num_errors))

Kazuki Inamura kzinmr