Skip to content

Instantly share code, notes, and snippets.

@sqs
Created August 26, 2024 16:15
Show Gist options
  • Save sqs/65fbe1e230b921abcadd35f2bb1161e6 to your computer and use it in GitHub Desktop.
Save sqs/65fbe1e230b921abcadd35f2bb1161e6 to your computer and use it in GitHub Desktop.
import { describe, expect, test } from 'vitest'
import { lastValueFromAsyncGenerator } from '../common/asyncGenerator'
import { parseMessageXMLLike } from './structuredMessageParser'
describe('parseXmlTags', () => {
    /** Produce an async stream that yields the given fragments one at a time, in order. */
    async function* streamOf(parts: string[]): AsyncGenerator<string> {
        for (const part of parts) {
            yield part
        }
    }

    /** Run the parser over `parts` and resolve to the last snapshot it emitted. */
    function lastResult<T extends string[]>(parts: string[], tags: T, initialBuffer?: string) {
        return lastValueFromAsyncGenerator(parseMessageXMLLike(streamOf(parts), tags, initialBuffer))
    }

    test('parses XML tags correctly', async () => {
        // The same tag may appear more than once; contents are collected in order.
        const result = await lastResult(
            ['<tag1>content1</tag1><tag2>content2</tag2><tag1>content3</tag1>'],
            ['tag1', 'tag2']
        )
        expect(result).toStrictEqual<typeof result>({
            tag1: ['content1', 'content3'],
            tag2: ['content2'],
        })
    })

    test('respects initialbuffer', async () => {
        // The opening tag comes from the initial buffer, the rest from the stream.
        const result = await lastResult(['content1</tag1>'], ['tag1'], '<tag1>')
        expect(result).toStrictEqual<typeof result>({
            tag1: ['content1'],
        })
    })

    test("handles '<' characters (simple)", async () => {
        const result = await lastResult(['<tag1>1<2</tag1>'], ['tag1'])
        expect(result).toStrictEqual<typeof result>({
            tag1: ['1<2'],
        })
    })

    test("handles '<' characters", async () => {
        const result = await lastResult(['<tag1>1<2</tag1><tag2>5>10</tag2>'], ['tag1', 'tag2'])
        expect(result).toStrictEqual<typeof result>({
            tag1: ['1<2'],
            tag2: ['5>10'],
        })
    })

    test('handles incomplete tag', async () => {
        // A tag left open at end of stream still contributes its buffered text.
        const result = await lastResult(['<tag1>content1'], ['tag1'])
        expect(result).toStrictEqual<typeof result>({
            tag1: ['content1'],
        })
    })

    test('handles incomplete tags', async () => {
        const result = await lastResult(['<tag1>content1</tag1><tag2>content2'], ['tag1', 'tag2'])
        expect(result).toStrictEqual<typeof result>({
            tag1: ['content1'],
            tag2: ['content2'],
        })
    })

    test('ignores non-target tags', async () => {
        const result = await lastResult(
            ['<tag1>content1</tag1><ignored>ignored content</ignored><tag2>content2</tag2>'],
            ['tag1', 'tag2']
        )
        expect(result).toStrictEqual<typeof result>({
            tag1: ['content1'],
            tag2: ['content2'],
        })
    })

    test('handles chunked input', async () => {
        // Fragment boundaries fall inside both tag content and a tag marker.
        const result = await lastResult(
            ['<tag1>con', 'tent1</tag1><t', 'ag2>content2</tag2>'],
            ['tag1', 'tag2']
        )
        expect(result).toStrictEqual<typeof result>({
            tag1: ['content1'],
            tag2: ['content2'],
        })
    })

    test('emits partial', async () => {
        const stream = streamOf(['<tag1>con', 'tent1</tag1><t', 'ag2>cont', 'ent2</tag2>'])
        const emissions: Record<string, string[]>[] = []
        for await (const snapshot of parseMessageXMLLike(stream, ['tag1', 'tag2'])) {
            // Deep-clone each emission so later parser updates cannot alter what was recorded.
            emissions.push(JSON.parse(JSON.stringify(snapshot)))
        }
        expect(emissions).toStrictEqual<typeof emissions>([
            {
                tag1: ['content1'],
                tag2: [],
            },
            {
                tag1: ['content1'],
                tag2: ['content2'],
            },
        ])
    })
})
/**
 * Parse the XML-like data in a response from an LLM.
 *
 * The input is consumed incrementally. For every tag named in `targetTags`,
 * the text between `<tag>` and the next `</tag>` is collected. Text outside
 * the target tags (including any other tags) is ignored.
 *
 * A snapshot of everything collected so far is yielded each time a target tag
 * is closed. If the input ends while a target tag is still open, the text
 * buffered since its opening marker is recorded as that tag's content and one
 * final snapshot is yielded. Every snapshot is an independent copy, so it does
 * not change when later content is parsed.
 *
 * @param chunks Stream of text fragments; markers may be split across fragments.
 * @param targetTags Names of the tags whose content should be collected.
 * @param initialBuffer Optional text treated as if it preceded the first
 * fragment. It is still parsed when `chunks` yields nothing.
 * @returns An async generator of snapshots that map each target tag to the
 * contents collected for it so far, in order of appearance.
 */
export async function* parseMessageXMLLike<T extends string[]>(
    chunks: AsyncGenerator<string>,
    targetTags: T,
    initialBuffer?: string
): AsyncGenerator<Record<T[number], string[]>> {
    type Tag = T[number]
    type Results = Record<Tag, string[]>
    type OpenTagMatch = { tag: Tag; index: number; end: number }

    const results = {} as Results
    for (const tag of targetTags) {
        results[tag as Tag] = []
    }

    // Build the opening markers once instead of on every scan of the buffer.
    const openMarkers = targetTags.map(tag => ({ tag: tag as Tag, marker: `<${tag}>` }))
    const longestOpenMarker = Math.max(0, ...openMarkers.map(({ marker }) => marker.length))

    /** Find the earliest opening marker of any target tag in `text`. */
    function findOpenTag(text: string): OpenTagMatch | null {
        let nearest: OpenTagMatch | null = null
        for (const { tag, marker } of openMarkers) {
            const index = text.indexOf(marker)
            if (index !== -1 && (nearest === null || index < nearest.index)) {
                nearest = { tag, index, end: index + marker.length }
            }
        }
        return nearest
    }

    /** Copy the collected arrays so a yielded value never changes afterwards. */
    function snapshot(): Results {
        const copy = {} as Results
        for (const tag of Object.keys(results) as Tag[]) {
            copy[tag] = [...results[tag]]
        }
        return copy
    }

    /**
     * The input stream with `initialBuffer` glued onto the first fragment, or
     * emitted on its own when the stream turns out to be empty.
     */
    async function* prefixedChunks(): AsyncGenerator<string> {
        let pending = initialBuffer
        for await (const chunk of chunks) {
            yield pending === undefined ? chunk : pending + chunk
            pending = undefined
        }
        if (pending !== undefined) {
            yield pending
        }
    }

    let buffer = ''
    let currentTag: Tag | null = null
    for await (const chunk of prefixedChunks()) {
        buffer += chunk
        while (buffer.length > 0) {
            if (currentTag === null) {
                const openTag = findOpenTag(buffer)
                if (openTag === null) {
                    // No complete opening marker yet. Only the last
                    // (longest marker - 1) characters can still be the start
                    // of one, so drop the rest to keep the buffer bounded.
                    const keep = longestOpenMarker - 1
                    if (keep <= 0) {
                        buffer = ''
                    } else if (buffer.length > keep) {
                        buffer = buffer.slice(buffer.length - keep)
                    }
                    break
                }
                currentTag = openTag.tag
                buffer = buffer.slice(openTag.end)
            }

            const closeMarker = `</${currentTag}>`
            const closeIndex = buffer.indexOf(closeMarker)
            if (closeIndex === -1) {
                // The tag is still open; wait for more input.
                break
            }

            results[currentTag].push(buffer.slice(0, closeIndex))
            yield snapshot()
            currentTag = null
            buffer = buffer.slice(closeIndex + closeMarker.length)
        }
    }

    // The input ended inside a target tag: keep what was received for it.
    if (currentTag !== null) {
        results[currentTag].push(buffer)
        yield snapshot()
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment