Chunks for Alenush
from typing import List

from razdel import sentenize
from transformers import AutoTokenizer


def sent_split(sent):
    # Default word splitter: break a sentence into words on whitespace.
    return sent.split()

def separate_big_sent_by_spaces(whole_text: str, tokenizer, limit_length: int = 512, sent_split=sent_split) -> List[str]:
    """
    Split a single oversized sentence into chunks by spaces.

    Use this after sentence splitting, when one sentence alone exceeds the
    subtoken limit. Returns a list of text chunks, each fitting into
    `limit_length` subtokens.
    """
    limit_length -= 1
    chunks_sent = []  # final chunks, decoded back to text
    words = sent_split(whole_text)
    temp_chunk_sent = []  # token ids of the chunk being accumulated
    for idx, word in enumerate(words):
        if idx:
            # Restore the space removed by the whitespace split.
            word = f" {word}"
        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        word_tokens_len = len(encoded_word)
        if word_tokens_len < limit_length:
            if len(temp_chunk_sent) + word_tokens_len < limit_length:
                temp_chunk_sent.extend(encoded_word)
            else:
                # The word does not fit: flush the current chunk and start a new one.
                chunks_sent.append(tokenizer.decode(temp_chunk_sent))
                temp_chunk_sent = encoded_word
        else:
            # A single word is longer than the limit: cut it by subtokens.
            temp_chunk_sent.extend(encoded_word)
            for _ in range(0, len(temp_chunk_sent), limit_length):
                if limit_length < len(temp_chunk_sent) + 1:
                    chunks_sent.append(tokenizer.decode(temp_chunk_sent[:limit_length]))
                    temp_chunk_sent = temp_chunk_sent[limit_length:]
    if len(temp_chunk_sent):
        chunks_sent.append(tokenizer.decode(temp_chunk_sent))
    return chunks_sent

def separate_text(whole_text: str, tokenizer, limit_length: int = 512, sent_split=sent_split) -> List[str]:
    """
    Split a text into chunks that each fit into `limit_length` subtokens
    (512 by default, or whatever context size is set).

    The text is first split into sentences; if a single sentence exceeds the
    limit, it is further cut by spaces and, if necessary, by subtokens.
    Returns a list of text chunks.
    """
    limit_length -= 1
    chunks = []  # final chunks, decoded back to text
    sentences = [sent.text for sent in sentenize(whole_text)]
    if len(sentences) == 0:
        return [whole_text]
    temp_chunk = []  # token ids of the chunk being accumulated
    for sent in sentences:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=False)
        sent_tokens_len = len(encoded_sent)
        if sent_tokens_len < limit_length:
            if len(temp_chunk) + sent_tokens_len < limit_length:
                temp_chunk.extend(encoded_sent)
            else:
                # The sentence does not fit: flush the current chunk and start a new one.
                chunks.append(tokenizer.decode(temp_chunk))
                temp_chunk = encoded_sent
        else:
            # The sentence alone exceeds the limit: flush what we have,
            # then split the sentence by spaces (and by subtokens if needed).
            if len(temp_chunk):
                chunks.append(tokenizer.decode(temp_chunk))
                temp_chunk = []
            chunked_sentence = separate_big_sent_by_spaces(sent, tokenizer, limit_length + 1, sent_split=sent_split)
            chunks.extend(chunked_sentence)
    if len(temp_chunk):
        chunks.append(tokenizer.decode(temp_chunk))
    return chunks

tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_mt_nlu_ru")
separate_big_sent_by_spaces("улоываиывдиотфви дофаивотфуивотфво", tokenizer, 15)
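
# A minimal usage sketch for separate_text, not part of the original gist:
# the sample text and the limit of 32 subtokens below are made up for
# illustration. It chunks a small multi-sentence text with the same tokenizer
# and prints how many subtokens each resulting chunk re-encodes to.
sample_text = (
    "Это первое предложение примера. "
    "Это второе предложение, оно немного длиннее первого. "
    "А это третье предложение, которое завершает небольшой пример текста."
)
chunks = separate_text(sample_text, tokenizer, limit_length=32)
for i, chunk in enumerate(chunks):
    n_subtokens = len(tokenizer.encode(chunk, add_special_tokens=False))
    print(i, n_subtokens, chunk)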