This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import regex | |
| from typing import List, Tuple | |
| def fuzzy_match_spans(annotation: str, text: str, max_num_errors: int = 5) -> List[Tuple[int, int]]: | |
| r = regex.compile('(%s)' % re.escape(annotation)) | |
| matches = [m.span() for m in r.finditer(text)] | |
| num_errors = 1 | |
| while not matches and num_errors <= max_num_errors: | |
| r = regex.compile('(%s){e<=%d}' % (re.escape(annotation), num_errors)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import groupby
from operator import itemgetter

# Sample input: an ascending list of integers containing several consecutive runs.
data = [1, 4, 5, 6, 10, 15, 16, 17, 18, 22, 25, 26, 27, 28]

# Inside a run of consecutive integers, (index - value) stays constant, so
# grouping the enumerated pairs on that difference splits the data into runs.
# The key takes the (index, value) pair as a single argument because
# tuple-parameter lambdas are a SyntaxError on Python 3, and every group is
# materialised with list() because map() is lazy there.
consecutive_ints = [
    list(map(itemgetter(1), group))
    for _, group in groupby(enumerate(data), key=lambda pair: pair[0] - pair[1])
]
# [[1],
#  [4, 5, 6],
#  [10],
#  [15, 16, 17, 18],
#  [22],
#  [25, 26, 27, 28]]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from typing import List | |
| from itertools import tee, combinations | |
| from collections import Counter | |
| # def count_cooccurrence_in_window(context_window, delimiter=' '): | |
| # return Counter([delimiter.join(bi) for bi in combinations(context_window, 2)]) | |
| def window_cooccurrence(sentence: List[str], window: int = 5) -> Counter: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from typing import List, Dict, Set, Optional | |
| from nltk.lm import MLE | |
| from nltk.util import ngrams | |
class InvalidOrderException(Exception):
    """Error type signalling an invalid order value.

    NOTE(review): the code that raises this is not visible here; presumably
    "order" is the n-gram order of the language model -- confirm at the
    raise site.
    """
class InvalidContextSizeException(Exception):
    """Error type signalling an invalid context size.

    NOTE(review): the code that raises this is not visible here; presumably
    the context is the token history handed to the language model -- confirm
    at the raise site.
    """
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections import OrderedDict | |
| from html.parser import HTMLParser | |
| import json | |
| class MyHTMLParser(HTMLParser): | |
| debug = False | |
| def __init__(self): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections import defaultdict, Counter | |
| from operator import add | |
| from functools import reduce | |
| import numpy as np | |
| from sklearn.cluster import KMeans | |
| def dict_of_list(keys, values): | |
| assert(len(keys) == len(values)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections import defaultdict | |
| from functools import reduce, partial | |
| import numpy as np | |
| from itertools import chain | |
def flatten(l):
    """Flatten exactly one level of nesting.

    Args:
        l: An iterable whose items are themselves iterable. The name is kept
            as-is so existing keyword callers keep working.

    Returns:
        A new list holding the items of every inner iterable, in order.
        Deeper levels of nesting are left untouched.
    """
    return list(chain.from_iterable(l))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| a |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| "calculate PMI(A,B)=P(A,B)/P(A)P(B) for every token A and B in a window" | |
| from itertools import tee, combinations | |
| from collections import Counter | |
| def count_bigram(sentence, window=5): | |
| # ['A','B','C','D', 'E', 'F', 'G'], 4 -> | |
| # [['A', 'B', 'C', 'D'], | |
| # ['B', 'C', 'D', 'E'], | |
| # ['C', 'D', 'E', 'F'], |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| try: | |
| from xml.etree.cElementTree import XML | |
| except ImportError: | |
| from xml.etree.ElementTree import XML | |
| import zipfile | |
| """ | |
| Module that extracts text from an MS XML Word document (.docx). | |
| (Inspired by python-docx <https://github.com/mikemaccana/python-docx>) |