Skip to content

Instantly share code, notes, and snippets.

@MeMartijn
Created October 29, 2024 15:01
Show Gist options
  • Save MeMartijn/8d66a181f32304de9c07c2529649c35b to your computer and use it in GitHub Desktop.
Save MeMartijn/8d66a181f32304de9c07c2529649c35b to your computer and use it in GitHub Desktop.
Jina AI's Segmenter ported to Python
import regex
from typing import List
# Define constants
#
# Numeric limits for the segmenter. The limits referenced by the patterns in
# this file are interpolated into regex fragments as bounded-quantifier
# limits ({m,n}).
#
# NOTE(review): MAX_NESTED_LIST_ITEMS, MIN_HORIZONTAL_RULE_LENGTH,
# MAX_SENTENCE_LENGTH, MAX_QUOTED_TEXT_LENGTH,
# MAX_PARENTHETICAL_CONTENT_LENGTH, MAX_NESTED_PARENTHESES,
# MAX_MATH_INLINE_LENGTH, MAX_MATH_BLOCK_LENGTH, MAX_STANDALONE_LINE_LENGTH,
# MAX_HTML_TAG_ATTRIBUTES_LENGTH and MAX_HTML_TAG_CONTENT_LENGTH are not
# referenced in the visible part of this file; presumably kept for parity
# with the upstream segmenter -- confirm before removing.

# Headings
MAX_HEADING_LENGTH = 7
MAX_HEADING_CONTENT_LENGTH = 200
MAX_HEADING_UNDERLINE_LENGTH = 200
MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100

# Lists
MAX_LIST_ITEM_LENGTH = 200
MAX_NESTED_LIST_ITEMS = 6
MAX_LIST_INDENT_SPACES = 7

# Blockquotes
MAX_BLOCKQUOTE_LINE_LENGTH = 200
MAX_BLOCKQUOTE_LINES = 15

# Code blocks
MAX_CODE_BLOCK_LENGTH = 1500
MAX_CODE_LANGUAGE_LENGTH = 20
MAX_INDENTED_CODE_LINES = 20

# Tables
MAX_TABLE_CELL_LENGTH = 200
MAX_TABLE_ROWS = 20
MAX_HTML_TABLE_LENGTH = 2000

# Horizontal rules
MIN_HORIZONTAL_RULE_LENGTH = 3

# Sentence-level constructs
MAX_SENTENCE_LENGTH = 400
MAX_QUOTED_TEXT_LENGTH = 300
MAX_PARENTHETICAL_CONTENT_LENGTH = 200
MAX_NESTED_PARENTHESES = 5
MAX_MATH_INLINE_LENGTH = 100
MAX_MATH_BLOCK_LENGTH = 500

# Paragraphs and standalone lines
MAX_PARAGRAPH_LENGTH = 1000
MAX_STANDALONE_LINE_LENGTH = 800

# Generic HTML tags
MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100
MAX_HTML_TAG_CONTENT_LENGTH = 1000

# Maximum number of characters LOOKAHEAD_PATTERN may scan for a sentence end.
LOOKAHEAD_RANGE = 100

# Base patterns
#
# NOTE(review): PUNCTUATION is an *ungrouped* alternation. Where it is
# interpolated next to other tokens (SENTENCE_END, NOT_PUNCTUATION_SPACE),
# those tokens bind only to its last alternative. Left untouched here to avoid
# changing matching behaviour -- confirm against upstream whether a (?:...)
# wrapper is intended.

# Character class: whitespace, ']', '}', ')', '>', ',' and "'".
AVOID_AT_START = r'[\s\]})>,\']'

# Sentence punctuation: . ! ? and ellipsis, a literal "...", U+2026,
# U+2047-U+2049, or an emoji. The \p{...} property classes need the
# third-party `regex` module imported by this file.
PUNCTUATION = r'[.!?…]|\.{3}|[\u2026\u2047-\u2049]|[\p{Emoji_Presentation}\p{Extended_Pictographic}]'

# A closing ' that is followed by `, or a closing '' that is followed by ``.
QUOTE_END = r"(?:'(?=`)|''(?=``))"

# Punctuation or quote end that is followed by whitespace or `$`.
SENTENCE_END = f"(?:{PUNCTUATION}(?<!{AVOID_AT_START}(?={PUNCTUATION}))|{QUOTE_END})(?=\\s|$)"

# A sentence end, or the zero-width position before a line break / `$`.
SENTENCE_BOUNDARY = f"(?:{SENTENCE_END}|(?=[\\r\\n]|$))"

# 1..LOOKAHEAD_RANGE characters, none of which starts a SENTENCE_END,
# followed by a SENTENCE_END.
# The quantifier's literal braces must be doubled inside an f-string: with
# single outer braces Python evaluated `1,{LOOKAHEAD_RANGE}` as the tuple
# (1, {100}) and embedded the text "(1, {100})" instead of "{1,100}".
LOOKAHEAD_PATTERN = f"(?:(?!{SENTENCE_END}).){{1,{LOOKAHEAD_RANGE}}}{SENTENCE_END}"

# Negative lookahead built from PUNCTUATION followed by \s (see NOTE above).
NOT_PUNCTUATION_SPACE = f"(?!{PUNCTUATION}\\s)"
def get_sentence_pattern(max_length: int) -> str:
    """Build the regex fragment for a single sentence.

    The fragment must not start where NOT_PUNCTUATION_SPACE forbids it, and
    then matches one of two shapes:

    * 1..max_length non-newline characters ending at SENTENCE_BOUNDARY, or
    * 1..max_length non-newline characters positioned directly before
      PUNCTUATION / QUOTE_END, optionally extended by LOOKAHEAD_PATTERN.

    Any trailing AVOID_AT_START characters are consumed as part of the match.

    Args:
        max_length (int): Upper bound of the non-newline character run.
    Returns:
        str: Regex source text (not compiled).
    """
    bounded_run = f"[^\\r\\n]{{1,{max_length}}}"

    # Shape 1: the run ends cleanly on a sentence boundary.
    complete = bounded_run + SENTENCE_BOUNDARY

    # Shape 2: the run stops right before punctuation; optionally keep
    # scanning ahead to the next sentence end.
    cut_short = (
        bounded_run
        + f"(?={PUNCTUATION}|{QUOTE_END})"
        + f"(?:{LOOKAHEAD_PATTERN})?"
    )

    pieces = [
        "(?:",
        NOT_PUNCTUATION_SPACE,
        "(?:", complete, "|", cut_short, ")",
        AVOID_AT_START, "*",
        ")",
    ]
    return "".join(pieces)
# Individual patterns
#
# Each fragment is regex source text wrapped in a non-capturing group and
# uses the MAX_* limits as bounded-quantifier limits. `^` / `$` are meant to
# be used with the MULTILINE flag (chunk_text passes it).

# Heading: at line start, one of
#   - 1..MAX_HEADING_LENGTH characters from "#*=-",
#   - a line starting with a word character, a line break, then 2+ "-"/"=",
#   - an opening <h1>..<h6> tag (with optional attributes),
# followed by 1..MAX_HEADING_CONTENT_LENGTH non-newline characters, an
# optional closing </hN> tag, and a line break or `$`.
HEADING_PATTERN = f"(?:^(?:[#*=-]{{1,{MAX_HEADING_LENGTH}}}|\\w[^\\r\\n]{{0,{MAX_HEADING_CONTENT_LENGTH}}}\\r?\\n[-=]{{2,{MAX_HEADING_UNDERLINE_LENGTH}}}|<h[1-6][^>]{{0,{MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}}>)[^\\r\\n]{{1,{MAX_HEADING_CONTENT_LENGTH}}}(?:</h[1-6]>)?(?:\\r?\\n|$))"
# List item: at line start or after a line break, up to 3 spaces/tabs, then a
# bullet (- * + •), a number followed by "." and whitespace, or a checkbox
# ([ ], [x], [X]); whitespace; 1..MAX_LIST_ITEM_LENGTH characters of text.
# Zero or more continuation lines indented by 2..MAX_LIST_INDENT_SPACES
# spaces/tabs are included in the same match.
LIST_PATTERN = f"(?:(?:^|\\r?\\n)[ \\t]{{0,3}}(?:[-*+•]|\\d+\\.\\s|\\[[ xX]\\])\\s+[^\\r\\n]{{1,{MAX_LIST_ITEM_LENGTH}}}(?:\\r?\\n[ \\t]{{2,{MAX_LIST_INDENT_SPACES}}}[^\\r\\n]{{1,{MAX_LIST_ITEM_LENGTH}}})*)"
# Blockquote: 1..MAX_BLOCKQUOTE_LINES consecutive lines that start with ">".
# NOTE(review): every line must end with a line break, so a final ">" line
# without a trailing newline is not matched by this fragment -- confirm
# whether that is intended.
BLOCKQUOTE_PATTERN = f"(?:(?:^>\\s?[^\\r\\n]{{1,{MAX_BLOCKQUOTE_LINE_LENGTH}}}\\r?\\n){{1,{MAX_BLOCKQUOTE_LINES}}})"
# Code block: either a ``` fence (optional language word, lazily up to
# MAX_CODE_BLOCK_LENGTH characters, closing ```), or 1..MAX_INDENTED_CODE_LINES
# lines that each start with a space or a tab.
# NOTE(review): as written a *single* leading space qualifies a line as
# indented code; Markdown conventionally uses four -- verify against upstream.
CODE_BLOCK_PATTERN = f"(?:(?:^|\\r?\\n)```(?:\\w{{0,{MAX_CODE_LANGUAGE_LENGTH}}})?\\r?\\n[\\s\\S]{{0,{MAX_CODE_BLOCK_LENGTH}}}?```(?:\\r?\\n|$)|(?:(?:^|\\r?\\n)(?: |\\t)[^\\r\\n]{{0,{MAX_CODE_BLOCK_LENGTH}}}(?:\\r?\\n|$)){{1,{MAX_INDENTED_CODE_LINES}}})"
# Table: either a pipe-delimited row ("|...|") with an optional "|---|"-style
# separator row and up to MAX_TABLE_ROWS further rows, or a literal
# <table>...</table> span of at most MAX_HTML_TABLE_LENGTH characters (lazy).
TABLE_PATTERN = f"(?:(?:^|\\r?\\n)(?:\\|[^\\r\\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\\|(?:\\r?\\n\\|[-:]{{1,{MAX_TABLE_CELL_LENGTH}}}\\|){{0,1}}(?:\\r?\\n\\|[^\\r\\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\\|){{0,{MAX_TABLE_ROWS}}}|<table>[\\s\\S]{{0,{MAX_HTML_TABLE_LENGTH}}}?</table>))"
# Paragraph: starts at line start or after a blank line, on a line that does
# not begin with - * + > whitespace or "<digits>."; continues over following
# lines under the same restriction; ends at a blank line or `$`.
# This is the only fragment not fully enclosed in one group, which is safe
# only because it is the last alternative in MAIN_PATTERN.
PARAGRAPH_PATTERN = f"(?:(?:^|\\r?\\n\\r?\\n)(?![-*+>\\s]|\\d+\\.)[^\\r\\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?:\\r?\\n(?!\\r?\\n|[-*+>\\s]|\\d+\\.)[^\\r\\n]{{1,{MAX_PARAGRAPH_LENGTH}}})*)(?:\\r?\\n\\r?\\n|$)"
# Combine all patterns
# Ordered alternation: at any position the first fragment that matches wins,
# so headings take priority over lists, lists over blockquotes, and so on.
# The combined pattern contains no capturing groups.
MAIN_PATTERN = f"{HEADING_PATTERN}|{LIST_PATTERN}|{BLOCKQUOTE_PATTERN}|{CODE_BLOCK_PATTERN}|{TABLE_PATTERN}|{PARAGRAPH_PATTERN}"
def chunk_text(text: str) -> List[str]:
    """
    Split the input text into chunks by scanning it with MAIN_PATTERN.

    Line endings are normalised to "\\n" first. Every non-overlapping match
    of MAIN_PATTERN (MULTILINE mode) becomes one chunk; surrounding
    whitespace is stripped and chunks that end up empty are dropped. Text
    that no pattern matches is not returned.

    Args:
        text (str): Input text to split
    Returns:
        List[str]: Non-empty, whitespace-stripped chunks in document order
    """
    # Collapse Windows (\r\n) and old-Mac (\r) line endings to \n.
    normalized = text.replace('\r\n', '\n').replace('\r', '\n')

    # MAIN_PATTERN has no capturing groups, so findall yields whole matches.
    raw_matches = regex.findall(MAIN_PATTERN, normalized, regex.MULTILINE)

    # Strip each match once, then keep only the non-empty results.
    stripped = (piece.strip() for piece in raw_matches)
    return [piece for piece in stripped if piece]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment