Created
August 8, 2020 11:11
-
-
Save ksopyla/d9d7bf1eda1a426ecff9fe2b40969dbc to your computer and use it in GitHub Desktop.
Pure Python char n-gram tokenizers: sequence-based and generator-based versions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#%% | |
from collections import Counter, OrderedDict | |
from itertools import zip_longest , tee | |
## ngram iterators and tokenizers, working on list or generators | |
def ngram_tokenizer_iter(iterable, n, fillvalue=''):
    """Tokenize an iterable of characters into non-overlapping n-grams.

    Consumes the input lazily, so it also works on generators.
    Example: 'ABCDEFG' with n=3 and fillvalue='x' yields 'ABC', 'DEF', 'Gxx'
    (the last chunk is padded with fillvalue up to length n).
    """
    # zip_longest over n references to the SAME iterator pulls n consecutive
    # items per step, grouping the stream into fixed-size chunks.
    chunks = zip_longest(*([iter(iterable)] * n), fillvalue=fillvalue)
    for chunk in chunks:
        yield "".join(chunk)
def ngram_tokenizer(ngrams):
    """Build a tokenizer that splits text into consecutive, non-overlapping
    n-grams, e.g. for ngrams=2: 'abcd' -> 'ab', 'cd'.
    """
    def tokenize(text):
        # Step through the text `ngrams` characters at a time; the final
        # slice may be shorter when len(text) is not a multiple of ngrams.
        starts = range(0, len(text), ngrams)
        return (text[start:start + ngrams] for start in starts)
    return tokenize
def ngram_vocab_gen(ngrams):
    """Build a generator factory producing ALL overlapping n-grams of a text.

    For ngrams=2: 'abcd' -> 'ab', 'bc', 'cd'.  Only full-length n-grams are
    emitted (no short tail gram), so a text shorter than `ngrams` yields
    nothing.  The original docstring incorrectly listed a trailing short
    gram ('d'); the code never produces one.
    """
    def func(text):
        # One n-gram starting at every position that still has `ngrams`
        # characters remaining: positions 0 .. len(text) - ngrams inclusive.
        return (text[i:i+ngrams] for i in range(len(text)+1-ngrams))
    return func
# An iterator analogue of ngram_vocab_gen can also be built following: | |
# https://stackoverflow.com/questions/5434891/iterate-a-list-as-pair-current-next-in-python | |
def ngram_vocab_gen_iter(iterator, ngrams):
    """Yield every overlapping n-gram from an iterable of characters.

    Iterator analogue of ngram_vocab_gen, usable on plain generators:
    'abcd' with ngrams=3 yields 'abc', 'bcd'.  Inputs shorter than
    `ngrams` yield nothing.
    """
    # Duplicate the stream `ngrams` times and advance the k-th copy by k
    # items; zipping the shifted copies then walks a sliding window of
    # width `ngrams` (zip stops as soon as the most-advanced copy runs out).
    copies = tee(iterator, ngrams)
    for offset, copy in enumerate(copies):
        for _ in range(offset):
            next(copy, "")
    for window in zip(*copies):
        yield "".join(window)
#%%
# Toy dataset built from a handful of characters so results are easy to
# eyeball.
dataset_text = "aabbcc ddaaa aacca caca baaba baac "
# dataset_text = "Your long text - usually the whole dataset merged into one string, a part of it, or a generator reading a file line by line"
#%% All possible overlapping char bi-grams (slicing variant)
grams = list(ngram_vocab_gen(2)(dataset_text))
print(f'{len(grams)} {grams[:10]}')
#%% Same result, iterator-based variant
grams = list(ngram_vocab_gen_iter(dataset_text, 2))
print(f'{len(grams)} {grams[:10]}')
#%% Non-overlapping bi-gram tokenizer, iterator variant
grams = list(ngram_tokenizer_iter(dataset_text, 2))
print(f'{len(grams)} {grams[:10]}')
#%% Non-overlapping bi-gram tokenizer, slicing variant
grams = list(ngram_tokenizer(2)(dataset_text))
print(f'{len(grams)} {grams[:10]}')
#%% Benchmark the four n-gram implementations against each other.
import timeit
import numpy as np

# timeit runs each snippet in a fresh namespace, so the setup code pulls
# the functions and the dataset back in from __main__.
SETUP_CODE = '''
from __main__ import ngram_tokenizer_iter, ngram_tokenizer,ngram_vocab_gen, ngram_vocab_gen_iter
from __main__ import dataset_text
'''
CODE1 = '''
a=list(ngram_tokenizer_iter(dataset_text,2))
'''
CODE2 = '''
a=list(ngram_tokenizer(2)(dataset_text))
'''
CODE3 = '''
a=list(ngram_vocab_gen(2)(dataset_text))
'''
CODE4 = '''
a=list(ngram_vocab_gen_iter(dataset_text,2))
'''

# Mean over 3 repeats of 10 executions each; one line of output per
# snippet, in the same format as before.
for snippet in (CODE1, CODE2, CODE3, CODE4):
    elapsed = np.mean(timeit.repeat(snippet, SETUP_CODE, repeat=3, number=10))
    print(f'{snippet} time={elapsed}')
# %%
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment