This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import groupby | |
from operator import itemgetter | |
# Sample input: integers in which some neighbours differ by exactly 1.
data = [1, 4, 5, 6, 10, 15, 16, 17, 18, 22, 25, 26, 27, 28]

# Split `data` into runs of consecutive integers.
#
# Inside a run each value grows by 1 while its position also grows by 1, so
# `index - value` stays constant for the whole run and changes where the run
# breaks; groupby() uses that difference as its grouping key.
#
# The key takes the (index, value) pair as one argument and indexes into it:
# tuple-unpacking parameters (`lambda (i, x): ...`) are a syntax error on
# Python 3.  Each group is materialised with list() immediately because
# groupby() invalidates a group's iterator once the next group is requested,
# and map() is lazy on Python 3 -- deferred groups would come out empty.
consecutive_ints = [
    list(map(itemgetter(1), group))
    for _, group in groupby(
        enumerate(data), key=lambda pair: pair[0] - pair[1]
    )
]
# [[1],
#  [4, 5, 6],
#  [10],
#  [15, 16, 17, 18],
#  [22],
#  [25, 26, 27, 28]]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List | |
from itertools import tee, combinations | |
from collections import Counter | |
# def count_cooccurrence_in_window(context_window, delimiter=' '): | |
# return Counter([delimiter.join(bi) for bi in combinations(context_window, 2)]) | |
def window_cooccurrence(sentence: List[str], window: int = 5) -> Counter: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List, Dict, Set, Optional | |
from nltk.lm import MLE | |
from nltk.util import ngrams | |
class InvalidOrderException(Exception):
    """Signal that an invalid order value was encountered.

    Adds nothing to ``Exception``; it exists so callers can catch this
    specific failure by type.

    NOTE(review): the code that raises this is not visible here.  Given the
    nearby ``nltk.lm`` / ``ngrams`` imports, "order" presumably refers to the
    n-gram order -- confirm against the raising site.
    """
class InvalidContextSizeException(Exception):
    """Signal that an invalid context size was encountered.

    Adds nothing to ``Exception``; it exists so callers can catch this
    specific failure by type.

    NOTE(review): the code that raises this is not visible here.  The
    "context" is presumably the sequence of preceding tokens handed to a
    language model -- confirm against the raising site.
    """
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import OrderedDict | |
from html.parser import HTMLParser | |
import json | |
class MyHTMLParser(HTMLParser): | |
debug = False | |
def __init__(self): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict, Counter | |
from operator import add | |
from functools import reduce | |
import numpy as np | |
from sklearn.cluster import KMeans | |
def dict_of_list(keys, values): | |
assert(len(keys) == len(values)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from functools import reduce, partial | |
import numpy as np | |
from itertools import chain | |
def flatten(l):
    """Return a new list with the items of every iterable in *l*, in order.

    Only one level of nesting is removed: each element of *l* must itself be
    iterable, and its items are copied into the result unchanged (so a
    string element contributes its individual characters, and deeper nested
    lists stay nested).

    Args:
        l: An iterable of iterables.

    Returns:
        A list containing the concatenated items.

    Raises:
        TypeError: If an element of *l* is not iterable.
    """
    return list(chain.from_iterable(l))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
a |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"calculate PMI(A,B)=P(A,B)/P(A)P(B) for every token A and B in a window" | |
from itertools import tee, combinations | |
from collections import Counter | |
def count_bigram(sentence, window=5): | |
# ['A','B','C','D', 'E', 'F', 'G'], 4 -> | |
# [['A', 'B', 'C', 'D'], | |
# ['B', 'C', 'D', 'E'], | |
# ['C', 'D', 'E', 'F'], |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
try: | |
from xml.etree.cElementTree import XML | |
except ImportError: | |
from xml.etree.ElementTree import XML | |
import zipfile | |
""" | |
Module that extract text from MS XML Word document (.docx). | |
(Inspired by python-docx <https://github.com/mikemaccana/python-docx>) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE(review): `df` is created outside this snippet; it is assumed to be a
# pandas DataFrame that has a 'date' column -- confirm against the caller.

# Move the 'date' column into the index so rows can be grouped by index level.
df = df.set_index('date')

# level=0 groups on the index produced above, i.e. the former 'date' column.
# Each iteration yields the group key and the sub-frame of rows sharing it;
# only the sub-frame is printed.
for date, new_df in df.groupby(level=0):
    print(new_df)