Created
August 26, 2024 16:15
-
-
Save sqs/65fbe1e230b921abcadd35f2bb1161e6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { describe, expect, test } from 'vitest' | |
import { lastValueFromAsyncGenerator } from '../common/asyncGenerator' | |
import { parseMessageXMLLike } from './structuredMessageParser' | |
/**
 * Behavioral tests for `parseMessageXMLLike`: extraction of target-tag contents from a
 * (possibly chunked) stream of XML-like text.
 */
describe('parseXmlTags', () => {
    /** Creates an async generator that yields the given strings one at a time, in order. */
    async function* streamOf(...pieces: string[]): AsyncGenerator<string> {
        for (const piece of pieces) {
            yield piece
        }
    }

    // Repeated occurrences of the same tag are collected in order of appearance.
    test('parses XML tags correctly', async () => {
        const source = streamOf('<tag1>content1</tag1><tag2>content2</tag2><tag1>content3</tag1>')
        const parsed = await lastValueFromAsyncGenerator(parseMessageXMLLike(source, ['tag1', 'tag2']))
        const expected: typeof parsed = {
            tag1: ['content1', 'content3'],
            tag2: ['content2'],
        }
        expect(parsed).toStrictEqual<typeof parsed>(expected)
    })

    // The third argument is text that precedes the first chunk; here it supplies the opening tag.
    test('respects initialbuffer', async () => {
        const source = streamOf('content1</tag1>')
        const parsed = await lastValueFromAsyncGenerator(
            parseMessageXMLLike(source, ['tag1'], '<tag1>')
        )
        const expected: typeof parsed = {
            tag1: ['content1'],
        }
        expect(parsed).toStrictEqual<typeof parsed>(expected)
    })

    // A bare '<' inside the content must not be mistaken for the start of a tag.
    test("handles '<' characters (simple)", async () => {
        const source = streamOf('<tag1>1<2</tag1>')
        const parsed = await lastValueFromAsyncGenerator(parseMessageXMLLike(source, ['tag1']))
        const expected: typeof parsed = {
            tag1: ['1<2'],
        }
        expect(parsed).toStrictEqual<typeof parsed>(expected)
    })

    // Both '<' and '>' are preserved verbatim inside tag content.
    test("handles '<' characters", async () => {
        const source = streamOf('<tag1>1<2</tag1><tag2>5>10</tag2>')
        const parsed = await lastValueFromAsyncGenerator(parseMessageXMLLike(source, ['tag1', 'tag2']))
        const expected: typeof parsed = {
            tag1: ['1<2'],
            tag2: ['5>10'],
        }
        expect(parsed).toStrictEqual<typeof parsed>(expected)
    })

    // A tag left open at the end of the input still contributes its content.
    test('handles incomplete tag', async () => {
        const source = streamOf('<tag1>content1')
        const parsed = await lastValueFromAsyncGenerator(parseMessageXMLLike(source, ['tag1']))
        const expected: typeof parsed = {
            tag1: ['content1'],
        }
        expect(parsed).toStrictEqual<typeof parsed>(expected)
    })

    // A complete tag followed by one that is never closed.
    test('handles incomplete tags', async () => {
        const source = streamOf('<tag1>content1</tag1><tag2>content2')
        const parsed = await lastValueFromAsyncGenerator(parseMessageXMLLike(source, ['tag1', 'tag2']))
        const expected: typeof parsed = {
            tag1: ['content1'],
            tag2: ['content2'],
        }
        expect(parsed).toStrictEqual<typeof parsed>(expected)
    })

    // Tags that are not listed as targets leave no trace in the result.
    test('ignores non-target tags', async () => {
        const source = streamOf(
            '<tag1>content1</tag1><ignored>ignored content</ignored><tag2>content2</tag2>'
        )
        const parsed = await lastValueFromAsyncGenerator(parseMessageXMLLike(source, ['tag1', 'tag2']))
        const expected: typeof parsed = {
            tag1: ['content1'],
            tag2: ['content2'],
        }
        expect(parsed).toStrictEqual<typeof parsed>(expected)
    })

    // Chunk boundaries fall inside both the content and an opening tag.
    test('handles chunked input', async () => {
        const source = streamOf('<tag1>con', 'tent1</tag1><t', 'ag2>content2</tag2>')
        const parsed = await lastValueFromAsyncGenerator(parseMessageXMLLike(source, ['tag1', 'tag2']))
        const expected: typeof parsed = {
            tag1: ['content1'],
            tag2: ['content2'],
        }
        expect(parsed).toStrictEqual<typeof parsed>(expected)
    })

    // One emission is expected per closed tag, each reflecting everything parsed so far.
    test('emits partial', async () => {
        const source = streamOf('<tag1>con', 'tent1</tag1><t', 'ag2>cont', 'ent2</tag2>')
        const emissions: Record<string, string[]>[] = []
        for await (const emitted of parseMessageXMLLike(source, ['tag1', 'tag2'])) {
            // Deep-clone each emission so the recorded snapshot cannot change if the parser
            // later mutates the arrays it yielded.
            const frozenCopy = JSON.parse(JSON.stringify(emitted))
            emissions.push(frozenCopy)
        }
        const expected: typeof emissions = [
            {
                tag1: ['content1'],
                tag2: [],
            },
            {
                tag1: ['content1'],
                tag2: ['content2'],
            },
        ]
        expect(emissions).toStrictEqual<typeof emissions>(expected)
    })
})
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Parse the XML-like data in a response from an LLM.
 *
 * The input is consumed incrementally. For every tag named in `targetTags`, the text found
 * between `<tag>` and `</tag>` is appended, in order of appearance, to that tag's array. Text
 * outside of target tags (including tags that are not targets) does not appear in the result.
 * Only the exact `</tag>` closer ends a tag's content, so stray `<` or `>` characters inside the
 * content are kept verbatim.
 *
 * A snapshot of everything collected so far is yielded each time a target tag is closed. If the
 * input ends while a target tag is still open, the remaining buffered text is recorded as that
 * tag's content and one final snapshot is yielded.
 *
 * @param chunks - Pieces of the message, in order. Tags may be split across chunk boundaries.
 * @param targetTags - Names of the tags to extract, without angle brackets.
 * @param initialBuffer - Optional text treated as if it preceded the first chunk. It is parsed
 * even when `chunks` yields nothing.
 * @returns An async generator of snapshots mapping every target tag to the contents collected so
 * far. Each snapshot owns its arrays, so later parsing never alters a snapshot already yielded.
 */
export async function* parseMessageXMLLike<T extends string[]>(
    chunks: AsyncGenerator<string>,
    targetTags: T,
    initialBuffer?: string
): AsyncGenerator<Record<T[number], string[]>> {
    // Precompute the opening marker of every target tag once instead of on every scan.
    const openMarkers = targetTags.map(name => ({ tag: name as T[number], open: `<${name}>` }))

    const results = {} as Record<T[number], string[]>
    for (const { tag } of openMarkers) {
        results[tag] = []
    }

    /** Copies the accumulated results so that emitted values are not affected by later pushes. */
    function snapshot(): Record<T[number], string[]> {
        const copy = {} as Record<T[number], string[]>
        for (const { tag } of openMarkers) {
            copy[tag] = [...results[tag]]
        }
        return copy
    }

    /** Finds the target opening tag that occurs earliest in `text`, if there is one. */
    function findOpenTag(text: string): { tag: T[number]; index: number; end: number } | null {
        let nearest: { tag: T[number]; index: number; end: number } | null = null
        for (const { tag, open } of openMarkers) {
            const index = text.indexOf(open)
            if (index !== -1 && (nearest === null || index < nearest.index)) {
                nearest = { tag, index, end: index + open.length }
            }
        }
        return nearest
    }

    /** Yields `initialBuffer` (when given) ahead of the caller's chunks. */
    async function* allChunks(): AsyncGenerator<string> {
        if (initialBuffer !== undefined) {
            yield initialBuffer
        }
        yield* chunks
    }

    let buffer = ''
    let currentTag: T[number] | null = null
    for await (const chunk of allChunks()) {
        buffer += chunk
        while (buffer.length > 0) {
            if (currentTag === null) {
                const opened = findOpenTag(buffer)
                if (opened === null) {
                    // Keep the buffer: it may end with the beginning of an opening tag.
                    break
                }
                currentTag = opened.tag
                buffer = buffer.slice(opened.end)
            }

            const closer = `</${currentTag}>`
            const closeIndex = buffer.indexOf(closer)
            if (closeIndex === -1) {
                // The closing tag has not arrived (or is only partially buffered) yet.
                break
            }

            results[currentTag].push(buffer.slice(0, closeIndex))
            yield snapshot()
            currentTag = null
            buffer = buffer.slice(closeIndex + closer.length)
        }
    }

    // The input ended inside an open tag: whatever is buffered becomes that tag's content.
    if (currentTag !== null) {
        results[currentTag].push(buffer)
        yield snapshot()
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment