-
-
Save jiyee/d9a9dd1544fd5c3063b825847212c887 to your computer and use it in GitHub Desktop.
Use regex to do chunking by using all semantic cues
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Used in https://jina.ai/tokenizer (Aug. 14th version) | |
// Define variables for magic numbers | |
const MAX_HEADING_LENGTH = 6; | |
const MAX_HEADING_CONTENT_LENGTH = 200; | |
const MAX_HEADING_UNDERLINE_LENGTH = 200; | |
const MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100; | |
const MAX_LIST_ITEM_LENGTH = 200; | |
const MAX_NESTED_LIST_ITEMS = 5; | |
const MAX_LIST_INDENT_SPACES = 7; | |
const MAX_BLOCKQUOTE_LINE_LENGTH = 200; | |
const MAX_BLOCKQUOTE_LINES = 10; | |
const MAX_CODE_BLOCK_LENGTH = 1000; | |
const MAX_CODE_LANGUAGE_LENGTH = 20; | |
const MAX_INDENTED_CODE_LINES = 20; | |
const MAX_TABLE_CELL_LENGTH = 200; | |
const MAX_TABLE_ROWS = 20; | |
const MAX_HTML_TABLE_LENGTH = 2000; | |
const MIN_HORIZONTAL_RULE_LENGTH = 3; | |
const MAX_SENTENCE_LENGTH = 300; | |
const MAX_QUOTED_TEXT_LENGTH = 300; | |
const MAX_PARENTHETICAL_CONTENT_LENGTH = 200; | |
const MAX_NESTED_PARENTHESES = 5; | |
const MAX_MATH_INLINE_LENGTH = 100; | |
const MAX_MATH_BLOCK_LENGTH = 500; | |
const MAX_PARAGRAPH_LENGTH = 1000; | |
const MAX_STANDALONE_LINE_LENGTH = 400; | |
const MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100; | |
const MAX_HTML_TAG_CONTENT_LENGTH = 1000; | |
// Read the regex and test text from files | |
export const chunkRegex = new RegExp( | |
"(" + | |
// 1. Headings (Setext-style, Markdown, and HTML-style, with length constraints) | |
`(?:^(?:[#*=-]{1,${MAX_HEADING_LENGTH}}|\\w[^\\r\\n]{0,${MAX_HEADING_CONTENT_LENGTH}}\\r?\\n[-=]{2,${MAX_HEADING_UNDERLINE_LENGTH}}|<h[1-6][^>]{0,${MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}>)[^\\r\\n]{1,${MAX_HEADING_CONTENT_LENGTH}}(?:</h[1-6]>)?(?:\\r?\\n|$))` + | |
"|" + | |
// 2. List items (bulleted, numbered, lettered, or task lists, including nested, up to three levels, with length constraints) | |
`(?:(?:^|\\r?\\n)[ \\t]{0,3}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+[^\\r\\n]{1,${MAX_LIST_ITEM_LENGTH}}` + | |
`(?:(?:\\r?\\n[ \\t]{2,5}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+[^\\r\\n]{1,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_NESTED_LIST_ITEMS}}` + | |
`(?:\\r?\\n[ \\t]{4,${MAX_LIST_INDENT_SPACES}}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+[^\\r\\n]{1,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_NESTED_LIST_ITEMS}})?)` + | |
"|" + | |
// 3. Block quotes (including nested quotes and citations, up to three levels, with length constraints) | |
`(?:(?:^>(?:>|\\s{2,}){0,2}[^\\r\\n]{0,${MAX_BLOCKQUOTE_LINE_LENGTH}}\\r?\\n?){1,${MAX_BLOCKQUOTE_LINES}})` + | |
"|" + | |
// 4. Code blocks (fenced, indented, or HTML pre/code tags, with length constraints) | |
`(?:(?:^|\\r?\\n)(?:\`\`\`|~~~)(?:\\w{0,${MAX_CODE_LANGUAGE_LENGTH}})?\\r?\\n[\\s\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:\`\`\`|~~~)\\r?\\n?` + | |
`|(?:(?:^|\\r?\\n)(?: {4}|\\t)[^\\r\\n]{0,${MAX_LIST_ITEM_LENGTH}}(?:\\r?\\n(?: {4}|\\t)[^\\r\\n]{0,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_INDENTED_CODE_LINES}}\\r?\\n?)` + | |
`|(?:<pre>(?:<code>)?[\\s\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:</code>)?</pre>))` + | |
"|" + | |
// 5. Tables (Markdown, grid tables, and HTML tables, with length constraints) | |
`(?:(?:^|\\r?\\n)(?:\\|[^\\r\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\|(?:\\r?\\n\\|[-:]{1,${MAX_TABLE_CELL_LENGTH}}\\|){0,1}(?:\\r?\\n\\|[^\\r\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\|){0,${MAX_TABLE_ROWS}}` + | |
`|<table>[\\s\\S]{0,${MAX_HTML_TABLE_LENGTH}}?</table>))` + | |
"|" + | |
// 6. Horizontal rules (Markdown and HTML hr tag) | |
`(?:^(?:[-*_]){${MIN_HORIZONTAL_RULE_LENGTH},}\\s*$|<hr\\s*/?>)` + | |
"|" + | |
// 10. Standalone lines or phrases (including single-line blocks and HTML elements, with length constraints) | |
`(?:^(?:<[a-zA-Z][^>]{0,${MAX_HTML_TAG_ATTRIBUTES_LENGTH}}>)?[^\\r\\n]{1,${MAX_STANDALONE_LINE_LENGTH}}(?:</[a-zA-Z]+>)?(?:\\r?\\n|$))` + | |
"|" + | |
// 7. Sentences or phrases ending with punctuation (including ellipsis and Unicode punctuation) | |
`(?:[^\\r\\n]{1,${MAX_SENTENCE_LENGTH}}(?:[.!?…]|\\.{3}|[\\u2026\\u2047-\\u2049]|[\\p{Emoji_Presentation}\\p{Extended_Pictographic}])(?=(?=\\s)(?=\\s*[A-Z])|(?=\\r?\\n)|(?=\\Z)))` + | |
"|" + | |
// 8. Quoted text, parenthetical phrases, or bracketed content (with length constraints) | |
"(?:" + | |
`\"\"\"[^\\r\\n]{0,${MAX_QUOTED_TEXT_LENGTH}}\"\"\"` + | |
`|['\"\`'"][^\\r\\n]{0,${MAX_QUOTED_TEXT_LENGTH}}['\"\`'"]` + | |
`|\\([^\\r\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}(?:\\([^\\r\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}\\)[^\\r\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}){0,${MAX_NESTED_PARENTHESES}}\\)` + | |
`|\\[[^\\r\\n\\[\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}(?:\\[[^\\r\\n\\[\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}\\][^\\r\\n\\[\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}){0,${MAX_NESTED_PARENTHESES}}\\]` + | |
`|\\$[^\\r\\n$]{0,${MAX_MATH_INLINE_LENGTH}}\\$` + | |
`|\`[^\`\\r\\n]{0,${MAX_MATH_INLINE_LENGTH}}\`` + | |
")" + | |
"|" + | |
// 9. Paragraphs (with length constraints) | |
`(?:(?<=\\r?\\n\\r?\\n|\\A)(?:<p>)?(?:(?!\\r?\\n\\r?\\n|\\z).){1,${MAX_PARAGRAPH_LENGTH}}(?:</p>)?(?=\\r?\\n\\r?\\n|\\z))` + | |
"|" + | |
// 11. HTML-like tags and their content (including self-closing tags and attributes, with length constraints) | |
`(?:<[a-zA-Z][^>]{0,${MAX_HTML_TAG_ATTRIBUTES_LENGTH}}(?:>[\\s\\S]{0,${MAX_HTML_TAG_CONTENT_LENGTH}}?</[a-zA-Z]+>|\\s*/>))` + | |
"|" + | |
// 12. LaTeX-style math expressions (inline and block, with length constraints) | |
`(?:(?:\\$\\$[\\s\\S]{0,${MAX_MATH_BLOCK_LENGTH}}?\\$\\$)|(?:\\$[^\\$\\r\\n]{0,${MAX_MATH_INLINE_LENGTH}}\\$))` + | |
"|" + | |
// 14. Fallback for any remaining content (with length constraints) | |
`(?:[^\\r\\n]{1,${MAX_STANDALONE_LINE_LENGTH}})` + | |
")", | |
"gm" | |
); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment