Chunks for Alenush
from typing import List

from razdel import sentenize
from transformers import AutoTokenizer


def sent_split(sent):
    # Default word splitter: break a sentence into words on whitespace.
    return sent.split()

def separate_big_sent_by_spaces(whole_text: str, tokenizer, limit_length: int = 512, sent_split=sent_split) -> List[str]:
    """
    Split a single oversized sentence into chunks by spaces.

    Use this after sentence splitting, when one sentence alone exceeds the
    subtoken limit. Returns a list of text chunks, each fitting into
    `limit_length` subtokens.
    """
    limit_length -= 1
    chunks_sent = []  # final chunks, decoded back to text
    words = sent_split(whole_text)
    temp_chunk_sent = []  # token ids of the chunk being accumulated
    for idx, word in enumerate(words):
        if idx:
            # Restore the space removed by the whitespace split.
            word = f" {word}"
        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        word_tokens_len = len(encoded_word)
        if word_tokens_len < limit_length:
            if len(temp_chunk_sent) + word_tokens_len < limit_length:
                temp_chunk_sent.extend(encoded_word)
            else:
                # The word does not fit: flush the current chunk and start a new one.
                chunks_sent.append(tokenizer.decode(temp_chunk_sent))
                temp_chunk_sent = encoded_word
        else:
            # A single word is longer than the limit: cut it by subtokens.
            temp_chunk_sent.extend(encoded_word)
            for _ in range(0, len(temp_chunk_sent), limit_length):
                if limit_length < len(temp_chunk_sent) + 1:
                    chunks_sent.append(tokenizer.decode(temp_chunk_sent[:limit_length]))
                    temp_chunk_sent = temp_chunk_sent[limit_length:]
    if len(temp_chunk_sent):
        chunks_sent.append(tokenizer.decode(temp_chunk_sent))
    return chunks_sent

def separate_text(whole_text: str, tokenizer, limit_length: int = 512, sent_split=sent_split) -> List[str]:
    """
    Split a text into chunks that each fit into `limit_length` subtokens
    (512 by default, or whatever context size is set).

    The text is first split into sentences; if a single sentence exceeds the
    limit, it is further cut by spaces and, if necessary, by subtokens.
    Returns a list of text chunks.
    """
    limit_length -= 1
    chunks = []  # final chunks, decoded back to text
    sentences = [sent.text for sent in sentenize(whole_text)]
    if len(sentences) == 0:
        return [whole_text]
    temp_chunk = []  # token ids of the chunk being accumulated
    for sent in sentences:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=False)
        sent_tokens_len = len(encoded_sent)
        if sent_tokens_len < limit_length:
            if len(temp_chunk) + sent_tokens_len < limit_length:
                temp_chunk.extend(encoded_sent)
            else:
                # The sentence does not fit: flush the current chunk and start a new one.
                chunks.append(tokenizer.decode(temp_chunk))
                temp_chunk = encoded_sent
        else:
            # The sentence alone exceeds the limit: flush what we have,
            # then split the sentence by spaces (and by subtokens if needed).
            if len(temp_chunk):
                chunks.append(tokenizer.decode(temp_chunk))
                temp_chunk = []
            chunked_sentence = separate_big_sent_by_spaces(sent, tokenizer, limit_length + 1, sent_split=sent_split)
            chunks.extend(chunked_sentence)
    if len(temp_chunk):
        chunks.append(tokenizer.decode(temp_chunk))
    return chunks

tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_mt_nlu_ru")
separate_big_sent_by_spaces("улоываиывдиотфви дофаивотфуивотфво", tokenizer, 15)
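
# A minimal usage sketch for separate_text, not part of the original gist:
# the sample text and the limit of 32 subtokens below are made up for
# illustration. It chunks a small multi-sentence text with the same tokenizer
# and prints how many subtokens each resulting chunk re-encodes to.
sample_text = (
    "Это первое предложение примера. "
    "Это второе предложение, оно немного длиннее первого. "
    "А это третье предложение, которое завершает небольшой пример текста."
)
chunks = separate_text(sample_text, tokenizer, limit_length=32)
for i, chunk in enumerate(chunks):
    n_subtokens = len(tokenizer.encode(chunk, add_special_tokens=False))
    print(i, n_subtokens, chunk)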