@king-menin
Created January 12, 2023 13:20
Chunks for Alenush
from transformers import AutoTokenizer
from razdel import sentenize
from typing import List

def sent_split(sent):
    """Default word splitter: split a sentence on whitespace."""
    return sent.split()


def separate_big_sent_by_spaces(whole_text: str, tokenizer, limit_length: int = 512, sent_split=sent_split) -> List[str]:
    """
    If one sentence is huge, split it by spaces.
    Returns an array of text chunks. Use this function after sentence splitting.
    """
    limit_length -= 1
    chunks_sent = []  # final chunks produced by splitting on spaces
    words = sent_split(whole_text)
    temp_chunk_sent = []  # subtoken ids of the chunk being accumulated
    for idx, word in enumerate(words):
        if idx:
            word = f" {word}"  # keep the space that separated this word from the previous one
        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        word_tokens_len = len(encoded_word)
        if word_tokens_len < limit_length:
            if len(temp_chunk_sent) + word_tokens_len < limit_length:
                temp_chunk_sent.extend(encoded_word)
            else:
                chunks_sent.append(tokenizer.decode(temp_chunk_sent))
                temp_chunk_sent = encoded_word
        else:
            # a single "word" is longer than the limit: keep its subtokens and cut them below
            temp_chunk_sent.extend(encoded_word)
    # cut whatever is left into pieces of at most limit_length subtokens
    for idx in range(0, len(temp_chunk_sent), limit_length):
        if limit_length < len(temp_chunk_sent) + 1:
            chunks_sent.append(tokenizer.decode(temp_chunk_sent[:limit_length]))
            temp_chunk_sent = temp_chunk_sent[limit_length:]
    if len(temp_chunk_sent):
        chunks_sent.append(tokenizer.decode(temp_chunk_sent))
    return chunks_sent


def separate_text(whole_text: str, tokenizer, limit_length: int = 512, sent_split=sent_split) -> List[str]:
    """
    Split the text into chunks: first split the big text into sentences.
    Chunks should always fit into 512 subtokens (or the given limit).
    If one huge sentence exceeds the limit, it is cut by subtokens.
    Returns an array of text chunks.
    """
    limit_length -= 1
    chunks = []  # final chunks are collected here
    sentences = [sent.text for sent in sentenize(whole_text)]
    if len(sentences) == 0:
        return [whole_text]
    temp_chunk = []  # subtoken ids of the chunk being accumulated
    for sent in sentences:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=False)
        sent_tokens_len = len(encoded_sent)
        if sent_tokens_len < limit_length:
            if len(temp_chunk) + sent_tokens_len < limit_length:
                temp_chunk.extend(encoded_sent)
            else:
                chunks.append(tokenizer.decode(temp_chunk))
                temp_chunk = encoded_sent
        else:
            # the sentence alone exceeds the limit: flush the buffer, then split the sentence by spaces
            if len(temp_chunk):
                chunks.append(tokenizer.decode(temp_chunk))
                temp_chunk = []
            # pass limit_length + 1 because the helper subtracts 1 again
            chunked_sentence = separate_big_sent_by_spaces(sent, tokenizer, limit_length + 1, sent_split=sent_split)
            chunks.extend(chunked_sentence)
    if len(temp_chunk):
        chunks.append(tokenizer.decode(temp_chunk))
    return chunks

tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_mt_nlu_ru")
separate_big_sent_by_spaces("улоываиывдиотфви дофаивотфуивотфво", tokenizer, 15)