Created
October 29, 2024 15:01
-
-
Save MeMartijn/8d66a181f32304de9c07c2529649c35b to your computer and use it in GitHub Desktop.
Jina AI's Segmenter ported to Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List

import regex
# Define constants
# Numeric limits that are interpolated into the regex patterns of this module
# as quantifier bounds. Some of them are not referenced by the patterns
# visible in this file.
MAX_HEADING_LENGTH = 7
MAX_HEADING_CONTENT_LENGTH = 200
MAX_HEADING_UNDERLINE_LENGTH = 200
MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100
MAX_LIST_ITEM_LENGTH = 200
MAX_NESTED_LIST_ITEMS = 6
MAX_LIST_INDENT_SPACES = 7
MAX_BLOCKQUOTE_LINE_LENGTH = 200
MAX_BLOCKQUOTE_LINES = 15
MAX_CODE_BLOCK_LENGTH = 1500
MAX_CODE_LANGUAGE_LENGTH = 20
MAX_INDENTED_CODE_LINES = 20
MAX_TABLE_CELL_LENGTH = 200
MAX_TABLE_ROWS = 20
MAX_HTML_TABLE_LENGTH = 2000
MIN_HORIZONTAL_RULE_LENGTH = 3
MAX_SENTENCE_LENGTH = 400
MAX_QUOTED_TEXT_LENGTH = 300
MAX_PARENTHETICAL_CONTENT_LENGTH = 200
MAX_NESTED_PARENTHESES = 5
MAX_MATH_INLINE_LENGTH = 100
MAX_MATH_BLOCK_LENGTH = 500
MAX_PARAGRAPH_LENGTH = 1000
MAX_STANDALONE_LINE_LENGTH = 800
MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100
MAX_HTML_TAG_CONTENT_LENGTH = 1000
LOOKAHEAD_RANGE = 100

# Base patterns
# Character class: whitespace, closing brackets, '>', ',' and an apostrophe.
AVOID_AT_START = r'[\s\]})>,\']'
# Alternatives for sentence-ending punctuation plus emoji property classes.
# The \p{...} properties require the third-party `regex` module.
# NOTE(review): the alternation is not wrapped in a group, so a token placed
# right after {PUNCTUATION} in a larger pattern binds to the last alternative
# only -- confirm this is intended.
PUNCTUATION = r'[.!?…]|\.{3}|[\u2026\u2047-\u2049]|[\p{Emoji_Presentation}\p{Extended_Pictographic}]'
# One or two apostrophes that are immediately followed by the same number of
# backticks.
QUOTE_END = r"(?:'(?=`)|''(?=``))"
# Punctuation (or a quote end) that is followed by whitespace or `$`.
SENTENCE_END = f"(?:{PUNCTUATION}(?<!{AVOID_AT_START}(?={PUNCTUATION}))|{QUOTE_END})(?=\\s|$)"
# Either a sentence end, or a zero-width position before a line break / `$`.
SENTENCE_BOUNDARY = f"(?:{SENTENCE_END}|(?=[\\r\\n]|$))"
# Between 1 and LOOKAHEAD_RANGE characters that do not start a sentence end,
# followed by a sentence end. The quantifier braces are doubled so that the
# f-string emits a literal `{1,N}`; written with single braces, Python
# evaluates `{1,{LOOKAHEAD_RANGE}}` as a tuple expression and the resulting
# regex contains the text `(1, {100})` instead of a quantifier.
LOOKAHEAD_PATTERN = f"(?:(?!{SENTENCE_END}).){{1,{LOOKAHEAD_RANGE}}}{SENTENCE_END}"
# Negative lookahead built from PUNCTUATION followed by `\s`.
NOT_PUNCTUATION_SPACE = f"(?!{PUNCTUATION}\\s)"
def get_sentence_pattern(max_length: int) -> str:
    """Build the regex source for a sentence of at most ``max_length`` characters.

    The returned pattern is a non-capturing group that:

    * refuses to start where ``NOT_PUNCTUATION_SPACE`` fails,
    * matches 1..``max_length`` non-newline characters that either end at
      ``SENTENCE_BOUNDARY``, or stop right before punctuation / a quote end
      and optionally continue through ``LOOKAHEAD_PATTERN``,
    * then consumes any trailing ``AVOID_AT_START`` characters.

    Args:
        max_length: Upper bound of the ``{1,max_length}`` quantifier applied
            to the run of non-newline characters.

    Returns:
        The pattern as a string, not compiled.
    """
    # Run of 1..max_length characters on a single line; used in both branches.
    line_run = f"[^\\r\\n]{{1,{max_length}}}"
    return (
        f"(?:{NOT_PUNCTUATION_SPACE}"
        f"(?:{line_run}{SENTENCE_BOUNDARY}"
        f"|{line_run}(?={PUNCTUATION}|{QUOTE_END})(?:{LOOKAHEAD_PATTERN})?)"
        f"{AVOID_AT_START}*)"
    )
# Individual patterns
# `^` in these patterns is meant to match at every line start; chunk_text
# passes regex.MULTILINE when it applies MAIN_PATTERN.

# Heading: a run of 1..MAX_HEADING_LENGTH of `#`, `*`, `=` or `-`, or a text
# line followed by an underline of `-`/`=`, or an opening <h1>..<h6> tag;
# then heading text, an optional closing </hN> tag and a line break or `$`.
HEADING_PATTERN = f"(?:^(?:[#*=-]{{1,{MAX_HEADING_LENGTH}}}|\\w[^\\r\\n]{{0,{MAX_HEADING_CONTENT_LENGTH}}}\\r?\\n[-=]{{2,{MAX_HEADING_UNDERLINE_LENGTH}}}|<h[1-6][^>]{{0,{MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}}>)[^\\r\\n]{{1,{MAX_HEADING_CONTENT_LENGTH}}}(?:</h[1-6]>)?(?:\\r?\\n|$))"
# List item: up to 3 leading spaces/tabs, a bullet, `N.` or `[ ]`/`[x]`
# marker, the item text, and any continuation lines indented by
# 2..MAX_LIST_INDENT_SPACES spaces/tabs.
LIST_PATTERN = f"(?:(?:^|\\r?\\n)[ \\t]{{0,3}}(?:[-*+•]|\\d+\\.\\s|\\[[ xX]\\])\\s+[^\\r\\n]{{1,{MAX_LIST_ITEM_LENGTH}}}(?:\\r?\\n[ \\t]{{2,{MAX_LIST_INDENT_SPACES}}}[^\\r\\n]{{1,{MAX_LIST_ITEM_LENGTH}}})*)"
# Blockquote: 1..MAX_BLOCKQUOTE_LINES consecutive lines that start with `>`;
# every line must be terminated by a line break.
BLOCKQUOTE_PATTERN = f"(?:(?:^>\\s?[^\\r\\n]{{1,{MAX_BLOCKQUOTE_LINE_LENGTH}}}\\r?\\n){{1,{MAX_BLOCKQUOTE_LINES}}})"
# Code block: a ``` fence with an optional language tag and a lazily matched
# body, or 1..MAX_INDENTED_CODE_LINES consecutive lines starting with one
# space or a tab.
# NOTE(review): Markdown indented code conventionally uses four spaces; the
# single space in `(?: |\t)` may be an extraction artifact -- confirm against
# the upstream pattern before changing it.
CODE_BLOCK_PATTERN = f"(?:(?:^|\\r?\\n)```(?:\\w{{0,{MAX_CODE_LANGUAGE_LENGTH}}})?\\r?\\n[\\s\\S]{{0,{MAX_CODE_BLOCK_LENGTH}}}?```(?:\\r?\\n|$)|(?:(?:^|\\r?\\n)(?: |\\t)[^\\r\\n]{{0,{MAX_CODE_BLOCK_LENGTH}}}(?:\\r?\\n|$)){{1,{MAX_INDENTED_CODE_LINES}}})"
# Table: a `|...|` row, an optional `|---|` separator row and up to
# MAX_TABLE_ROWS further rows, or a lazily matched <table>...</table> span.
TABLE_PATTERN = f"(?:(?:^|\\r?\\n)(?:\\|[^\\r\\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\\|(?:\\r?\\n\\|[-:]{{1,{MAX_TABLE_CELL_LENGTH}}}\\|){{0,1}}(?:\\r?\\n\\|[^\\r\\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\\|){{0,{MAX_TABLE_ROWS}}}|<table>[\\s\\S]{{0,{MAX_HTML_TABLE_LENGTH}}}?</table>))"
# Paragraph: starts at `^` or after a blank line, must not begin with a
# list/quote marker, whitespace or `N.`; continues over following lines that
# satisfy the same restriction; ends at a blank line or `$`.
PARAGRAPH_PATTERN = f"(?:(?:^|\\r?\\n\\r?\\n)(?![-*+>\\s]|\\d+\\.)[^\\r\\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?:\\r?\\n(?!\\r?\\n|[-*+>\\s]|\\d+\\.)[^\\r\\n]{{1,{MAX_PARAGRAPH_LENGTH}}})*)(?:\\r?\\n\\r?\\n|$)"

# Combine all patterns
# Alternatives are tried left to right, so earlier patterns take priority.
MAIN_PATTERN = f"{HEADING_PATTERN}|{LIST_PATTERN}|{BLOCKQUOTE_PATTERN}|{CODE_BLOCK_PATTERN}|{TABLE_PATTERN}|{PARAGRAPH_PATTERN}"
def chunk_text(text: str) -> List[str]:
    """
    Split the input text into chunks using the combined regex MAIN_PATTERN.

    Args:
        text (str): Input text to split.

    Returns:
        List[str]: The matched chunks in order of appearance, each stripped
            of surrounding whitespace. Matches that are empty after stripping
            are dropped; text that MAIN_PATTERN does not match is not
            included in the result.
    """
    # Pre-process text to ensure consistent line endings
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # MAIN_PATTERN contains no capturing groups, so findall returns the whole
    # text of every match. MULTILINE lets `^` / `$` match at each line.
    matches = regex.findall(MAIN_PATTERN, text, regex.MULTILINE)

    # Strip each match once, then filter out the ones that became empty.
    stripped = (match.strip() for match in matches)
    return [chunk for chunk in stripped if chunk]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment