Last active
April 4, 2020 18:52
-
-
Save maciejskorski/4dcc81b0a995e0eaf701b2c36b8895a1 to your computer and use it in GitHub Desktop.
skipgram generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import islice,chain | |
from collections import deque | |
def gen_skipgrams(itr, window=1, symmetric=False, Q=None):
    """Lazily yield skip-gram pairs from an iterable of tokens.

    Every token is paired with the tokens currently held in a sliding
    window of preceding tokens, so ``window=1`` yields adjacent pairs only.

    Args:
        itr: Iterable of tokens; it is consumed exactly once.
        window: Maximum number of preceding tokens paired with each token.
            Only used when a fresh window is created (see ``Q``).
        symmetric: If true, every ``(context, token)`` pair is immediately
            followed by its mirror ``(token, context)``.
        Q: Optional non-empty ``deque`` of preceding context tokens. It is
            used as the sliding window as-is (its own ``maxlen`` applies)
            and is mutated in place. A missing or empty ``Q`` is replaced
            by a fresh ``deque(maxlen=window)``.

    Yields:
        ``(context, token)`` tuples; for each token the contexts come
        oldest first.

    Raises:
        ValueError: If a fresh window is needed and ``window`` is smaller
            than 1. Raised when iteration starts, because this is a
            generator.
    """
    if not Q:
        if window < 1:
            raise ValueError("window must be a positive integer")
        # Start with an empty window. Pre-filling it from the input would
        # silently drop the pairs among the first ``window - 1`` tokens.
        Q = deque(maxlen=window)
    append = Q.append
    for token in iter(itr):
        for context in Q:
            yield (context, token)
            if symmetric:
                yield (token, context)
        # The bounded deque evicts the oldest token once it is full.
        append(token)
# Treat the text as an iterable of sentences.
text = 'I eat oat flakes every day. This is my beloved breakfast. It is also very healthy. '
sentences = text.split('. ')
# Build pairs sentence by sentence so that no pair crosses a sentence boundary.
words = map(str.split, sentences)
# Pass ``window`` by keyword: a third positional argument would bind to
# ``symmetric`` rather than to the context deque ``Q``.
pairs = chain.from_iterable(gen_skipgrams(sent, window=1) for sent in words)
for p in pairs:
    print(p)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment