|
/**
 * This is my attempt at a solution. Send input text to `parseInput`.
 * - Und3rf10w
 */
|
|
|
/**
 * One segment produced by the parser: either a run of plain text or the
 * body of a fenced code block.
 */
export interface ParsedItem {
  /** Which kind of segment this is. */
  type: 'text' | 'code';

  /** The segment's lines joined with `\n` (fence lines themselves excluded). */
  content: string;

  /** Language taken from the fence's info string; only set on code segments. */
  language?: string;

  /** Extra details taken from the fence's info string; only set on code segments. */
  metadata?: {
    /** Raw text found between `{` and `}` in the info string, e.g. `1,3-5`. */
    highlightedLines?: string;

    /** Value of a `filename=` / `name=` pair in the info string. */
    fileName?: string;
  };
}
|
|
|
/**
 * Splits `inputText` into an ordered list of plain-text and fenced-code
 * segments using a small line-by-line state machine.
 *
 * Fence rules:
 * - A block opens on a line that (after trimming) starts with three or more
 *   backticks. The rest of the line is the info string and must not contain
 *   backticks, so an inline span such as "```x``` text" stays plain text.
 * - A block closes on a line made only of backticks that is at least as long
 *   as the opening fence.
 * - Exception: a fence of exactly the opening length is kept as content when
 *   the next non-blank line (at most three blank lines away) is a longer
 *   backtick-only fence; that longer fence then closes the block. This lets
 *   a block carry a same-length fence pair inside it.
 * - Fences shorter than the opener are always content.
 * - A block still open at the end of the input is emitted with whatever was
 *   collected, even if that is nothing.
 *
 * @param inputText Raw text, lines separated by `\n`.
 * @returns The segments in document order; `[]` for empty input.
 */
export const parseInput = (inputText: string): ParsedItem[] => {
  if (!inputText) return [];

  // Hoisted so the patterns are not rebuilt for every line.
  const OPENING_FENCE = /^(`{3,})([^`]*)$/;
  const BARE_FENCE = /^`{3,}$/;
  // Upper bound on blank lines skipped while looking for a longer fence.
  const MAX_BLANK_LINES = 3;

  const items: ParsedItem[] = [];
  const lines = inputText.split('\n');

  let inCodeBlock = false;
  let buffer: string[] = [];
  let infoString = '';
  let openFenceLength = 0;

  // Emits the buffered lines as a code segment and returns to text mode.
  const emitCodeBlock = (): void => {
    const { language, metadata } = parseInfoString(infoString);
    items.push({
      type: 'code',
      content: buffer.join('\n'),
      language,
      metadata,
    });
    inCodeBlock = false;
    buffer = [];
    infoString = '';
    openFenceLength = 0;
  };

  // True when the first non-blank line at or after `from` is a backtick-only
  // fence longer than `fenceLength`, with no more than MAX_BLANK_LINES blank
  // lines skipped on the way.
  const longerFenceFollows = (from: number, fenceLength: number): boolean => {
    let blanks = 0;
    for (let k = from; k < lines.length && blanks <= MAX_BLANK_LINES; k++) {
      const candidate = lines[k].trim();
      if (candidate === '') {
        blanks++;
        continue;
      }
      // First non-blank line decides either way.
      return BARE_FENCE.test(candidate) && candidate.length > fenceLength;
    }
    return false;
  };

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const trimmed = line.trim();

    if (!inCodeBlock) {
      const opener = OPENING_FENCE.exec(trimmed);
      if (!opener) {
        buffer.push(line);
        continue;
      }

      // Flush the text collected before this fence.
      if (buffer.length > 0) {
        items.push({ type: 'text', content: buffer.join('\n') });
      }
      inCodeBlock = true;
      buffer = [];
      openFenceLength = opener[1].length;
      infoString = opener[2].trim();
      continue;
    }

    // Inside a block: anything that is not a bare fence is content.
    if (!BARE_FENCE.test(trimmed)) {
      buffer.push(line);
      continue;
    }

    const fenceLength = trimmed.length;

    // Too short to close this block.
    if (fenceLength < openFenceLength) {
      buffer.push(line);
      continue;
    }

    // Same length as the opener, but a longer fence follows: treat this one
    // as content and let the longer fence close the block.
    if (
      fenceLength === openFenceLength &&
      longerFenceFollows(i + 1, fenceLength)
    ) {
      buffer.push(line);
      continue;
    }

    emitCodeBlock();
  }

  // Flush whatever is left. An unterminated block is emitted even when it is
  // empty, matching what a closed empty block produces.
  if (inCodeBlock) {
    emitCodeBlock();
  } else if (buffer.length > 0) {
    items.push({ type: 'text', content: buffer.join('\n') });
  }

  return items;
};
|
|
|
/**
 * Parses the info string that follows an opening code fence.
 *
 * Recognized pieces:
 * - `{...}`: the trimmed contents of the first brace group become
 *   `highlightedLines`; every brace group is removed before further parsing.
 * - `(1,3-5)`: parenthesized groups made only of digits, commas and dashes
 *   (legacy highlight syntax) are removed and otherwise ignored.
 * - The first token is the language unless it is immediately followed by `=`
 *   (then it is a key and the language defaults to `plaintext`).
 * - `key=value` pairs, with case-insensitive keys. Values may be quoted with
 *   `"` or `'` (backslash escapes the next character) or unquoted, in which
 *   case they run until the next `key=` and may contain spaces. Only
 *   `filename` and `name` are used; a later non-empty value wins.
 * - Bare tokens that are not part of a `key=value` pair are skipped.
 *
 * Quoted values are never touched by the brace/parenthesis removal, so
 * `filename="report (2).ts"` keeps its name intact.
 *
 * @param infoString Text after the opening fence's backticks.
 * @returns The language (default `plaintext`) and a metadata object whose
 *   `fileName` / `highlightedLines` are `undefined` when absent or empty.
 */
function parseInfoString(infoString: string): {
  language: string;
  metadata: {
    fileName: string | undefined;
    highlightedLines: string | undefined;
  };
} {
  const raw = infoString || '';

  // Each stripping pattern has a first alternative that matches a quoted
  // value right after `=`; those matches are copied through unchanged so
  // braces and parentheses inside quotes survive.
  const QUOTED_OR_BRACES = /(=\s*(["'])(?:\\.|(?!\2)[^\\])*\2)|\{([^}]*)\}/g;
  const QUOTED_OR_LEGACY_PARENS =
    /(=\s*(["'])(?:\\.|(?!\2)[^\\])*\2)|\(\s*\d+\s*(?:-\s*\d+)?(?:\s*,\s*\d+\s*(?:-\s*\d+)?)*\s*\)/g;
  const KEY_EQUALS = /^([A-Za-z][A-Za-z0-9_-]*)\s*=\s*/;
  const NEXT_KEY = /\s+[A-Za-z][A-Za-z0-9_-]*\s*=/;

  // Pass 1: drop brace groups, remembering the first one as the highlight spec.
  let highlightedLines: string | undefined;
  let braceSeen = false;
  const withoutBraces = raw.replace(
    QUOTED_OR_BRACES,
    (
      match: string,
      quoted: string | undefined,
      _quote: string | undefined,
      braceBody: string | undefined,
    ): string => {
      if (quoted !== undefined) return match;
      if (!braceSeen) {
        braceSeen = true;
        highlightedLines = (braceBody ?? '').trim() || undefined;
      }
      return ' ';
    },
  );

  // Pass 2: drop parenthesized groups that look like legacy highlight syntax.
  const stripped = withoutBraces
    .replace(
      QUOTED_OR_LEGACY_PARENS,
      (match: string, quoted: string | undefined): string =>
        quoted !== undefined ? match : ' ',
    )
    .trim();

  // The first token is the language unless it is the key of a key=value pair.
  let language = 'plaintext';
  let remainder = stripped;

  const firstToken = /^([^\s=]+)/.exec(remainder);
  if (firstToken) {
    const after = remainder.slice(firstToken[0].length);
    if (!/^\s*=/.test(after)) {
      language = firstToken[1];
      remainder = after.trim();
    }
  }

  let fileName: string | undefined;

  while (remainder) {
    const keyMatch = KEY_EQUALS.exec(remainder);
    if (!keyMatch) {
      // Bare flag or stray token: skip it so later key=value pairs are
      // still seen. `remainder` is trimmed and non-empty here, so this
      // always consumes at least one character.
      remainder = remainder.replace(/^\S+/, '').trim();
      continue;
    }

    const key = keyMatch[1].toLowerCase();
    remainder = remainder.slice(keyMatch[0].length);

    let value = '';
    const quote = remainder[0];

    if (quote === '"' || quote === "'") {
      // Quoted value: read up to the matching quote; a backslash makes the
      // next character literal. An unterminated quote runs to the end.
      let j = 1;
      while (j < remainder.length && remainder[j] !== quote) {
        if (remainder[j] === '\\' && j + 1 < remainder.length) {
          value += remainder[j + 1];
          j += 2;
        } else {
          value += remainder[j];
          j++;
        }
      }
      remainder = remainder.slice(j + 1);
    } else {
      // Unquoted value: everything up to the whitespace that precedes the
      // next `key=`, or to the end of the string.
      const nextKey = NEXT_KEY.exec(remainder);
      const end = nextKey ? nextKey.index : remainder.length;
      value = remainder.slice(0, end).trim();
      remainder = remainder.slice(end);
    }

    // Both `filename` and `name` set the file name; empty values are ignored.
    if ((key === 'filename' || key === 'name') && value !== '') {
      fileName = value;
    }

    remainder = remainder.trim();
  }

  return { language, metadata: { fileName, highlightedLines } };
}