Last active
November 2, 2025 00:07
-
-
Save tarruda/09dcbc44c2be0cbc96a4b9809942d503 to your computer and use it in GitHub Desktop.
Test page for Qwen3-VL + llama.cpp server
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <!DOCTYPE html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8" /> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"/> | |
| <title>Image OCR and Bounding Box Drawer</title> | |
| <style> | |
| :root { | |
| --pad: 12px; | |
| --accent: #e53935; | |
| } | |
| body { | |
| font-family: Arial, sans-serif; | |
| margin: 20px; | |
| line-height: 1.4; | |
| } | |
| .controls { | |
| display: flex; | |
| gap: 12px; | |
| align-items: center; | |
| flex-wrap: wrap; | |
| margin-bottom: 10px; | |
| } | |
| .controls label { | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 6px; | |
| font-size: 14px; | |
| } | |
| #apiUrl { | |
| padding: 6px 10px; | |
| font-size: 14px; | |
| border: 1px solid #ccc; | |
| border-radius: 4px; | |
| width: 400px; | |
| font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; | |
| } | |
| #saveApiBtn { | |
| padding: 6px 12px; | |
| background: #4CAF50; | |
| color: white; | |
| border: none; | |
| border-radius: 4px; | |
| cursor: pointer; | |
| font-size: 14px; | |
| } | |
| #saveApiBtn:hover { | |
| background: #45a049; | |
| } | |
| .format-selector { | |
| margin: 10px 0; | |
| padding: 12px; | |
| background: #f5f5f5; | |
| border-radius: 6px; | |
| } | |
| .format-selector h3 { | |
| margin-top: 0; | |
| font-size: 16px; | |
| color: #333; | |
| } | |
| .format-option { | |
| margin: 8px 0; | |
| padding: 8px; | |
| background: white; | |
| border-radius: 4px; | |
| cursor: pointer; | |
| transition: background 0.2s; | |
| } | |
| .format-option:hover { | |
| background: #e8f4f8; | |
| } | |
| .format-option input[type="radio"] { | |
| margin-right: 8px; | |
| } | |
| .format-option label { | |
| cursor: pointer; | |
| display: block; | |
| } | |
| .format-description { | |
| margin-left: 24px; | |
| font-size: 12px; | |
| color: #666; | |
| margin-top: 4px; | |
| } | |
| .prompt-container { | |
| margin: 10px 0; | |
| } | |
| .prompt-container label { | |
| display: block; | |
| font-size: 14px; | |
| margin-bottom: 6px; | |
| font-weight: 600; | |
| } | |
| #promptTextarea { | |
| width: 100%; | |
| max-width: 800px; | |
| min-height: 100px; | |
| padding: 8px; | |
| font-size: 14px; | |
| font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; | |
| border: 1px solid #ccc; | |
| border-radius: 4px; | |
| resize: vertical; | |
| } | |
| #submitBtn { | |
| padding: 8px 10px; | |
| font-size: 14px; | |
| cursor: pointer; | |
| } | |
| #canvas-container { | |
| position: relative; | |
| display: inline-block; | |
| margin-top: 12px; | |
| } | |
| #imageCanvas { | |
| border: 1px solid #ccc; | |
| display: block; | |
| } | |
| #overlayCanvas { | |
| position: absolute; | |
| left: 0; | |
| top: 0; | |
| pointer-events: auto; | |
| } | |
| #summary { | |
| margin-top: 20px; | |
| white-space: pre-wrap; | |
| } | |
| #progress { | |
| margin-top: 20px; | |
| white-space: pre-wrap; | |
| border: 1px solid #ddd; | |
| padding: 10px; | |
| max-height: 220px; | |
| overflow-y: auto; | |
| font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; | |
| font-size: 12px; | |
| background: #fafafa; | |
| } | |
| .pill { | |
| display: inline-block; | |
| padding: 2px 6px; | |
| border-radius: 999px; | |
| background: #efefef; | |
| font-size: 12px; | |
| margin-left: 8px; | |
| } | |
| #tooltip { | |
| position: absolute; | |
| background: rgba(0, 0, 0, 0.85); | |
| color: white; | |
| padding: 6px 10px; | |
| border-radius: 4px; | |
| font-size: 13px; | |
| pointer-events: none; | |
| z-index: 1000; | |
| max-width: 300px; | |
| word-wrap: break-word; | |
| white-space: pre-wrap; | |
| display: none; | |
| } | |
| .preset-buttons { | |
| margin-top: 8px; | |
| display: flex; | |
| gap: 8px; | |
| flex-wrap: wrap; | |
| } | |
| .preset-btn { | |
| padding: 4px 10px; | |
| background: #f0f0f0; | |
| border: 1px solid #ccc; | |
| border-radius: 4px; | |
| cursor: pointer; | |
| font-size: 12px; | |
| } | |
| .preset-btn:hover { | |
| background: #e0e0e0; | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <h1>Upload Image for OCR and Bounding Boxes</h1> | |
| <div class="controls"> | |
| <input type="text" id="apiUrl" value="http://localhost:8080/v1/chat/completions" /> | |
| <button id="saveApiBtn">Save</button> | |
| <input type="file" id="imageUpload" accept="image/*"> | |
| <button id="submitBtn">Process Image</button> | |
| <label><input type="checkbox" id="toggleLabels"> Show labels</label> | |
| <span id="streamStatus" class="pill" hidden>streaming…</span> | |
| </div> | |
| <div class="format-selector"> | |
| <h3>Response Format</h3> | |
| <div class="format-option"> | |
| <label> | |
| <input type="radio" name="format" value="format1" checked> | |
| <strong>Format 1: Object-based</strong> | |
| </label> | |
| <div class="format-description"> | |
| Each bounding box is an object with bbox_2d array and text_content string | |
| </div> | |
| </div> | |
| <div class="format-option"> | |
| <label> | |
| <input type="radio" name="format" value="format2"> | |
| <strong>Format 2: Tuple-based</strong> | |
| </label> | |
| <div class="format-description"> | |
| Each bounding box is a 6-element array: [text_label, x_min, y_min, x_max, y_max, color_name] | |
| </div> | |
| </div> | |
| </div> | |
| <div class="prompt-container"> | |
| <label for="promptTextarea">Prompt:</label> | |
| <textarea id="promptTextarea">Spotting all the objects and all the text in the image with line-level, Output in JSON format as { "summary": "summary of the whole image", "bounding_boxes": [ { "bbox_2d": [x1,y1,x2,y2], "text_content": "text_or_description" } ] }.</textarea> | |
| </div> | |
| <div id="canvas-container"> | |
| <canvas id="imageCanvas"></canvas> | |
| <canvas id="overlayCanvas"></canvas> | |
| </div> | |
| <div id="tooltip"></div> | |
| <div id="summary"></div> | |
| <div id="progress">Idle. Upload an image to preview, then click "Process Image".</div> | |
| <script> | |
// ----------------------- DOM references -----------------------
// Tiny lookup helper so every element reference below is one short line.
const byId = (id) => document.getElementById(id);

const fileInput = byId('imageUpload');
const submitBtn = byId('submitBtn');
const progressDiv = byId('progress');
const summaryDiv = byId('summary');
const imageCanvas = byId('imageCanvas');
const overlayCanvas = byId('overlayCanvas');
// 2D context of the overlay canvas; every box/label drawing goes through it.
const overlayCtx = overlayCanvas.getContext('2d');
const toggleLabels = byId('toggleLabels');
const promptTextarea = byId('promptTextarea');
const streamStatusEl = byId('streamStatus');
const tooltip = byId('tooltip');
const apiUrlInput = byId('apiUrl');
const saveApiBtn = byId('saveApiBtn');

// ----------------------- State -----------------------
// The uploaded image as a data: URL, plus its base64 payload and MIME type.
let uploadedDataUrl = null;
let base64Image = null;
let mimeType = null;
// Image element drawn onto the base canvas.
let imgEl = new Image();
// Boxes parsed while a response is streaming / boxes from the final JSON.
let interimBoxes = [];
let finalBoxes = [];
// True while an overlay redraw is already queued with requestAnimationFrame.
let rafPending = false;
// True while a response is being streamed (selects interimBoxes for drawing).
let streamingActive = false;
// Box currently under the mouse pointer, or null.
let hoveredBox = null;
// Endpoint used for requests; starts as the input's default value.
let apiUrl = apiUrlInput.value;
// ----------------------- API Configuration -----------------------
// localStorage key under which the endpoint URL is persisted.
const API_URL_STORAGE_KEY = 'ocrApiUrl';

// Restore the endpoint saved by a previous visit, if any. Accessing
// localStorage can throw (for example a SecurityError when storage is blocked);
// an uncaught throw here would abort the whole script and leave every button
// without a handler, so fall back to the input's default value instead.
try {
  const savedApiUrl = localStorage.getItem(API_URL_STORAGE_KEY);
  if (savedApiUrl) {
    apiUrlInput.value = savedApiUrl;
    apiUrl = savedApiUrl;
  }
} catch (err) {
  console.warn('Could not read the saved API URL from localStorage:', err);
}

// Save API URL: apply the trimmed input value to the running page and persist it.
saveApiBtn.addEventListener('click', () => {
  const url = apiUrlInput.value.trim();
  if (!url) {
    alert('Please enter an API URL');
    return;
  }
  // Apply first so the URL is usable for this session even if persisting fails.
  apiUrl = url;
  try {
    localStorage.setItem(API_URL_STORAGE_KEY, url);
  } catch (err) {
    console.warn('Could not persist the API URL to localStorage:', err);
    alert('API URL applied for this session, but it could not be saved.');
    return;
  }
  alert('API URL saved!');
});
// ----------------------- Prompt Templates -----------------------
// Default prompt text for each response format; the key matches the value of
// the corresponding "format" radio button.
const prompts = {
  format1: 'Spotting all the text in the image with line-level, Output in JSON format as { "summary": "summary of the whole image", "bounding_boxes": [ { "bbox_2d": [x1,y1,x2,y2], "text_content": "text" } ] }.',
  // Built line by line; joined with newlines it is the same multi-line prompt.
  format2: [
    'Perform a comprehensive analysis of the provided image to detect and extract all **text** objects present. Your task is to identify every instance of text objects within the image, regardless of its size and orientation.',
    'Bounding boxes:',
    '- text_label: The label of the bounding box',
    '- x_min: x-coordinate of the top-left corner (in pixels)',
    '- y_min: y-coordinate of the top-left corner (in pixels)',
    '- x_max: x-coordinate of the bottom-right corner (in pixels)',
    '- y_max: y-coordinate of the bottom-right corner (in pixels)',
    '- color_name: The color name',
    'JSON format:',
    '```json',
    '{',
    '"has": <true|false>,',
    '"bounding_boxes": [',
    '[ <text_label>, <x_min>, <y_min>, <x_max>, <y_max>, <color_name> ],',
    '],',
    '"summary": "Summary of the image"',
    '}',
    '```',
  ].join('\n'),
};
// ----------------------- Format Selection -----------------------
// Selecting a response format replaces the prompt text with that format's
// template from `prompts`.
const formatRadios = document.querySelectorAll('input[name="format"]');
for (const radio of formatRadios) {
  radio.addEventListener('change', function applyFormatPrompt() {
    promptTextarea.value = prompts[radio.value];
  });
}
/**
 * Value of the checked "format" radio button.
 * @returns {string} The checked radio's value, or 'format1' when none is checked.
 */
function getSelectedFormat() {
  const checkedRadio = document.querySelector('input[name="format"]:checked');
  if (!checkedRadio) {
    return 'format1';
  }
  return checkedRadio.value;
}
// ----------------------- JSON Schemas -----------------------
// Response schema for format 1: each bounding box is an object holding a
// four-number `bbox_2d` array and a `text_content` string.
const jsonSchema1 = (() => {
  const coordinate = (description) => ({ type: 'number', description });

  const boxSchema = {
    type: 'object',
    additionalProperties: false,
    properties: {
      bbox_2d: {
        type: 'array',
        prefixItems: ['x1', 'y1', 'x2', 'y2'].map((label) => coordinate(label)),
        minItems: 4,
        maxItems: 4,
      },
      text_content: {
        type: 'string',
        description: 'either text content or a description of the object delimited by the bounding box',
      },
    },
    required: ['bbox_2d', 'text_content'],
  };

  return {
    name: 'image_description_with_boxes',
    strict: true,
    schema: {
      type: 'object',
      additionalProperties: false,
      properties: {
        bounding_boxes: {
          type: 'array',
          description: 'List of detected regions.',
          items: boxSchema,
        },
        summary: { type: 'string', description: 'Summary of the image.' },
      },
      required: ['bounding_boxes', 'summary'],
    },
  };
})();
// Response schema for format 2: each bounding box is a fixed six-element tuple
// [text_label, x_min, y_min, x_max, y_max, color_name].
const jsonSchema2 = (() => {
  const field = (type, description) => ({ type, description });

  const tupleSchema = {
    type: 'array',
    prefixItems: [
      field('string', 'text_label'),
      field('number', 'x_min'),
      field('number', 'y_min'),
      field('number', 'x_max'),
      field('number', 'y_max'),
      field('string', 'color_name'),
    ],
    minItems: 6,
    maxItems: 6,
  };

  return {
    name: 'image_regions_v1',
    strict: true,
    schema: {
      type: 'object',
      additionalProperties: false,
      properties: {
        has: field('boolean', 'Whether the target object(s) are present.'),
        bounding_boxes: {
          type: 'array',
          description: 'List of detected regions as fixed-length tuples.',
          items: tupleSchema,
        },
        summary: field('string', 'Summary of the image.'),
      },
      required: ['has', 'bounding_boxes', 'summary'],
    },
  };
})();
// ----------------------- Helpers: canvas setup & drawing -----------------------
/**
 * Give the image canvas and the overlay canvas the same backing-store size
 * and the same CSS size, then wipe the overlay.
 * @param {number} w - Width in pixels.
 * @param {number} h - Height in pixels.
 */
function setupCanvases(w, h) {
  for (const canvas of [imageCanvas, overlayCanvas]) {
    canvas.width = w;
    canvas.height = h;
    canvas.style.width = `${w}px`;
    canvas.style.height = `${h}px`;
  }
  clearOverlay();
}
/**
 * Repaint the base canvas: clear it, then draw the current image stretched
 * over the whole canvas.
 */
function drawBaseImage() {
  const baseCtx = imageCanvas.getContext('2d');
  const { width, height } = imageCanvas;
  baseCtx.clearRect(0, 0, width, height);
  baseCtx.drawImage(imgEl, 0, 0, width, height);
}
/** Erase everything drawn on the overlay canvas. */
function clearOverlay() {
  const { width, height } = overlayCanvas;
  overlayCtx.clearRect(0, 0, width, height);
}
/**
 * Whether the "Show labels" checkbox is ticked.
 * @returns {boolean}
 */
function labelsEnabled() {
  const { checked } = toggleLabels;
  return checked;
}
/**
 * Convert a bounding box to canvas pixels, with ordered corners clamped to
 * the canvas.
 *
 * The coordinate scale is guessed from the largest absolute coordinate:
 * at most 1 (within 1e-6) is treated as fractions of the canvas size, at most
 * 1000 (within 1e-6) as a 0-1000 grid, anything larger as pixels already.
 *
 * @param {Array} bbox - [x1, y1, x2, y2]; each entry is converted with Number().
 * @param {number} W - Canvas width in pixels.
 * @param {number} H - Canvas height in pixels.
 * @returns {number[]} [left, top, right, bottom].
 */
function scaleBbox(bbox, W, H) {
  const [rawX1, rawY1, rawX2, rawY2] = bbox.map(Number);
  const magnitude = Math.max(
    Math.abs(rawX1),
    Math.abs(rawY1),
    Math.abs(rawX2),
    Math.abs(rawY2)
  );

  // Choose the per-axis conversion once, then apply it to both corners.
  let toPixelsX = (v) => v;
  let toPixelsY = (v) => v;
  if (magnitude <= 1 + 1e-6) {
    toPixelsX = (v) => v * W;
    toPixelsY = (v) => v * H;
  } else if (magnitude <= 1000 + 1e-6) {
    toPixelsX = (v) => v / 1000 * W;
    toPixelsY = (v) => v / 1000 * H;
  }

  const xa = toPixelsX(rawX1);
  const xb = toPixelsX(rawX2);
  const ya = toPixelsY(rawY1);
  const yb = toPixelsY(rawY2);

  const clamp = (value, upper) => Math.max(0, Math.min(upper, value));
  return [
    clamp(Math.min(xa, xb), W),
    clamp(Math.min(ya, yb), H),
    clamp(Math.max(xa, xb), W),
    clamp(Math.max(ya, yb), H),
  ];
}
/**
 * Stroke every valid box and, optionally, draw its text label above it.
 *
 * Entries without a `bbox_2d` array, and boxes whose scaled width or height
 * is zero, are skipped. The box equal to `hoveredBox` gets a lighter, thicker
 * outline.
 *
 * @param {CanvasRenderingContext2D} ctx - Context to draw on.
 * @param {Array} boxes - Objects shaped like { bbox_2d, text_content }.
 * @param {boolean} showLabels - Draw `text_content` labels when truthy.
 */
function drawBoxes(ctx, boxes, showLabels) {
  const canvasW = imageCanvas.width;
  const canvasH = imageCanvas.height;
  const LABEL_HEIGHT = 12;
  const LABEL_PAD_X = 4;
  const LABEL_PAD_Y = 3;

  // Tinted background plus text, placed just above the box (never above y = 0).
  const paintLabel = (text, left, top) => {
    const textWidth = ctx.measureText(text).width;
    const labelTop = Math.max(0, top - LABEL_HEIGHT - 6);
    ctx.fillStyle = 'rgba(255,0,0,0.18)';
    ctx.fillRect(left, labelTop, textWidth + LABEL_PAD_X * 2, LABEL_HEIGHT + LABEL_PAD_Y * 2);
    ctx.fillStyle = '#ff0000';
    ctx.fillText(text, left + LABEL_PAD_X, labelTop + LABEL_HEIGHT + 1);
  };

  ctx.save();
  ctx.lineWidth = 2;
  ctx.font = '12px Arial';
  for (const box of boxes) {
    if (!box || !Array.isArray(box.bbox_2d)) continue;
    const [left, top, right, bottom] = scaleBbox(box.bbox_2d, canvasW, canvasH);
    const width = Math.max(0, right - left);
    const height = Math.max(0, bottom - top);
    if (width <= 0 || height <= 0) continue;

    const isHovered = hoveredBox === box;
    ctx.strokeStyle = isHovered ? '#ff6b6b' : '#ff0000';
    ctx.lineWidth = isHovered ? 3 : 2;
    ctx.strokeRect(left, top, width, height);

    const rawLabel = box.text_content;
    if (showLabels && typeof rawLabel === 'string' && rawLabel.trim()) {
      paintLabel(rawLabel.trim(), left, top);
    }
  }
  ctx.restore();
}
/**
 * Queue one overlay redraw for the next animation frame. Calls made while a
 * redraw is already queued do nothing. The redraw shows `interimBoxes` while
 * streaming is active and `finalBoxes` otherwise.
 */
function scheduleOverlayRender() {
  if (!rafPending) {
    rafPending = true;
    requestAnimationFrame(function renderOverlayFrame() {
      rafPending = false;
      clearOverlay();
      const visibleBoxes = streamingActive ? interimBoxes : finalBoxes;
      drawBoxes(overlayCtx, visibleBoxes, labelsEnabled());
    });
  }
}
// ----------------------- Hover detection -----------------------
/**
 * Find the box containing a canvas-space point. Boxes are searched from last
 * to first, so a later box wins where boxes overlap; edges count as inside.
 *
 * @param {number} x - X coordinate in canvas pixels.
 * @param {number} y - Y coordinate in canvas pixels.
 * @returns {Object|null} The matching box object, or null when none matches.
 */
function getBoxAtPoint(x, y) {
  const canvasW = imageCanvas.width;
  const canvasH = imageCanvas.height;
  const candidates = streamingActive ? interimBoxes : finalBoxes;

  const containsPoint = (box) => {
    if (!box || !Array.isArray(box.bbox_2d)) return false;
    const [left, top, right, bottom] = scaleBbox(box.bbox_2d, canvasW, canvasH);
    return x >= left && x <= right && y >= top && y <= bottom;
  };

  // Reverse a copy so the search runs last-to-first without touching the state.
  const hit = [...candidates].reverse().find((box) => containsPoint(box));
  return hit ?? null;
}
// Track the box under the pointer: highlight it and show its text in the tooltip.
function handleOverlayMouseMove(event) {
  // Map the pointer from CSS pixels to canvas pixels (the two differ whenever
  // the canvas is displayed at a size other than its backing-store size).
  const bounds = overlayCanvas.getBoundingClientRect();
  const scaleX = overlayCanvas.width / bounds.width;
  const scaleY = overlayCanvas.height / bounds.height;
  const canvasX = (event.clientX - bounds.left) * scaleX;
  const canvasY = (event.clientY - bounds.top) * scaleY;

  const box = getBoxAtPoint(canvasX, canvasY);
  // Redraw only when the hovered box actually changes.
  if (box !== hoveredBox) {
    hoveredBox = box;
    scheduleOverlayRender();
  }

  if (!box?.text_content) {
    tooltip.style.display = 'none';
    overlayCanvas.style.cursor = 'default';
    return;
  }
  tooltip.textContent = box.text_content;
  tooltip.style.display = 'block';
  tooltip.style.left = `${event.pageX + 10}px`;
  tooltip.style.top = `${event.pageY - 30}px`;
  overlayCanvas.style.cursor = 'pointer';
}

// Pointer left the canvas: drop the highlight and hide the tooltip.
function handleOverlayMouseLeave() {
  hoveredBox = null;
  tooltip.style.display = 'none';
  scheduleOverlayRender();
}

overlayCanvas.addEventListener('mousemove', handleOverlayMouseMove);
overlayCanvas.addEventListener('mouseleave', handleOverlayMouseLeave);
// ----------------------- Helpers: streaming JSON parsing -----------------------
/**
 * Pull the JSON text out of the accumulated model output.
 *
 * A closed ```json fence wins: its trimmed content is returned as-is.
 * Otherwise the text from the first `{` to its matching `}` is returned,
 * ignoring braces that appear inside JSON strings.
 *
 * @param {string} accumulated - Everything streamed so far.
 * @returns {string|null} The JSON text, or null when no complete candidate exists.
 */
function extractFinalJsonStringFrom(accumulated) {
  const FENCE_OPEN = '```json';
  const FENCE_CLOSE = '```';

  const fenceAt = accumulated.indexOf(FENCE_OPEN);
  if (fenceAt !== -1) {
    const fenced = accumulated.slice(fenceAt + FENCE_OPEN.length);
    const closeAt = fenced.indexOf(FENCE_CLOSE);
    if (closeAt !== -1) {
      return fenced.slice(0, closeAt).trim();
    }
  }

  const start = accumulated.indexOf('{');
  if (start === -1) return null;

  let nesting = 0;
  let insideString = false;
  let escaped = false;
  for (let pos = start; pos < accumulated.length; pos += 1) {
    const ch = accumulated[pos];
    if (insideString) {
      // Inside a string only an unescaped quote matters.
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') insideString = false;
      continue;
    }
    switch (ch) {
      case '"':
        insideString = true;
        break;
      case '{':
        nesting += 1;
        break;
      case '}':
        nesting -= 1;
        if (nesting === 0) {
          return accumulated.slice(start, pos + 1).trim();
        }
        break;
      default:
        break;
    }
  }
  return null;
}
/**
 * Parse the bounding boxes that are already complete in a partially streamed
 * response.
 *
 * @param {string} accumulated - Everything streamed so far.
 * @param {string} format - 'format2' for tuple boxes; anything else is treated as format 1.
 * @returns {Array<{bbox_2d: Array, text_content: *}>} De-duplicated boxes in stream order.
 */
function extractPartialBoxes(accumulated, format) {
  if (format === 'format2') {
    return extractPartialBoxesFormat2(accumulated);
  }
  return extractPartialBoxesFormat1(accumulated);
}

/**
 * Text following the `[` that opens the "bounding_boxes" array, or null when
 * the key or its opening bracket has not been streamed yet.
 */
function boundingBoxesArrayBody(accumulated) {
  const keyIdx = accumulated.indexOf('"bounding_boxes"');
  if (keyIdx === -1) return null;
  const startBracket = accumulated.indexOf('[', keyIdx);
  if (startBracket === -1) return null;
  return accumulated.slice(startBracket + 1);
}

/**
 * Collect the source text of every complete top-level item in an array body.
 *
 * Items are delimited by `openChar`/`closeChar` ('{' '}' for objects, '[' ']'
 * for tuples). Delimiters inside JSON strings are ignored, an item still open
 * at the end of the text is dropped, and scanning stops at the `]` that closes
 * the array itself. A closer seen while no item is open is ignored, so it can
 * never push the depth below zero.
 */
function scanTopLevelItems(arrayBody, openChar, closeChar) {
  const items = [];
  let inString = false;
  let escaped = false;
  let depth = 0;
  let itemStart = -1;
  for (let i = 0; i < arrayBody.length; i++) {
    const ch = arrayBody[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === openChar) {
      if (depth === 0) itemStart = i;
      depth++;
    } else if (ch === closeChar && depth > 0) {
      depth--;
      if (depth === 0) {
        items.push(arrayBody.slice(itemStart, i + 1));
        itemStart = -1;
      }
    } else if (ch === ']' && depth === 0) {
      break; // end of the bounding_boxes array
    }
  }
  return items;
}

/** JSON.parse that returns undefined instead of throwing on malformed text. */
function tryParseJson(text) {
  try {
    return JSON.parse(text);
  } catch {
    // A balanced but malformed item is expected noise in a live stream; skip it.
    return undefined;
  }
}

/**
 * Format 1: items are objects `{ "bbox_2d": [...], "text_content": ... }`.
 * Objects lacking either key are skipped.
 */
function extractPartialBoxesFormat1(accumulated) {
  const body = boundingBoxesArrayBody(accumulated);
  if (body === null) return [];
  const boxes = [];
  for (const itemText of scanTopLevelItems(body, '{', '}')) {
    const obj = tryParseJson(itemText);
    if (obj && Array.isArray(obj.bbox_2d) && 'text_content' in obj) {
      boxes.push(obj);
    }
  }
  return dedupeBoxes(boxes);
}

/**
 * Format 2: items are tuples `[text_label, x_min, y_min, x_max, y_max, color_name]`,
 * converted here to format-1 objects.
 */
function extractPartialBoxesFormat2(accumulated) {
  const body = boundingBoxesArrayBody(accumulated);
  if (body === null) return [];
  const boxes = [];
  for (const itemText of scanTopLevelItems(body, '[', ']')) {
    const arr = tryParseJson(itemText);
    // The label plus four coordinates are read (indexes 0-4), so at least five
    // elements are required; a shorter tuple would yield an undefined
    // coordinate. This matches the check in convertFormat2ToFormat1.
    if (Array.isArray(arr) && arr.length >= 5) {
      boxes.push({
        bbox_2d: [arr[1], arr[2], arr[3], arr[4]],
        text_content: arr[0]
      });
    }
  }
  return dedupeBoxes(boxes);
}

/**
 * Drop boxes whose text and coordinates repeat an earlier box, keeping the
 * first occurrence and the original order.
 */
function dedupeBoxes(boxes) {
  const seen = new Set();
  const out = [];
  for (const b of boxes) {
    const key = JSON.stringify([b.text_content, ...(b.bbox_2d || [])]);
    if (!seen.has(key)) {
      seen.add(key);
      out.push(b);
    }
  }
  return out;
}
/**
 * Convert a format-2 result (tuple boxes) into the format-1 shape (object boxes).
 *
 * Tuples with at least five elements become `{ bbox_2d, text_content }`; any
 * other entry is passed through unchanged. When `bounding_boxes` is not an
 * array the input object itself is returned.
 *
 * @param {Object} result - Parsed model response.
 * @returns {Object} A shallow copy with converted `bounding_boxes`, or `result` itself.
 */
function convertFormat2ToFormat1(result) {
  const tuples = result.bounding_boxes;
  if (!Array.isArray(tuples)) {
    return result;
  }

  const toBoxObject = (entry) => {
    const isTuple = Array.isArray(entry) && entry.length >= 5;
    if (!isTuple) {
      return entry; // not in the expected shape: keep as-is
    }
    const [label, xMin, yMin, xMax, yMax] = entry;
    return { bbox_2d: [xMin, yMin, xMax, yMax], text_content: label };
  };

  return { ...result, bounding_boxes: tuples.map((entry) => toBoxObject(entry)) };
}
// ----------------------- Image upload -----------------------
// When a file is chosen: read it as a data: URL, remember its MIME type and
// base64 payload, draw it on the base canvas and reset all box state.
fileInput.addEventListener('change', () => {
  const file = fileInput.files[0];
  if (!file) return;
  const reader = new FileReader();
  // Report a failed read instead of leaving the status text unchanged.
  reader.onerror = () => {
    console.error(reader.error);
    progressDiv.textContent = 'Could not read the selected file.';
  };
  reader.onload = (e) => {
    uploadedDataUrl = e.target.result;
    // A data URL looks like "data:<mime>;base64,<payload>".
    const parts = uploadedDataUrl.split(',');
    mimeType = parts[0].split(':')[1].split(';')[0];
    base64Image = parts[1];
    // Keep a local reference so the handlers below always act on THIS image,
    // even if the shared `imgEl` is replaced before it finishes loading.
    const img = new Image();
    imgEl = img;
    img.onload = () => {
      setupCanvases(img.naturalWidth, img.naturalHeight);
      drawBaseImage();
      interimBoxes = [];
      finalBoxes = [];
      streamingActive = false;
      hoveredBox = null;
      progressDiv.textContent = 'Image loaded. Click "Process Image" to start.';
      summaryDiv.textContent = '';
      scheduleOverlayRender();
    };
    // Report files that cannot be decoded as an image.
    img.onerror = () => {
      progressDiv.textContent = 'The selected file could not be decoded as an image.';
    };
    img.src = uploadedDataUrl;
  };
  reader.readAsDataURL(file);
});
// Redraw the overlay whenever the "Show labels" checkbox changes.
toggleLabels.addEventListener('change', scheduleOverlayRender);
// ----------------------- Process Image (streaming) -----------------------

/**
 * Read a File as a data: URL.
 * @param {File} file
 * @returns {Promise<string>} Resolves with the data URL; rejects with an Error
 *   when the read fails.
 */
function readFileAsDataUrl(file) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = (e) => resolve(e.target.result);
    reader.onerror = () => reject(reader.error ?? new Error('Could not read the selected file.'));
    reader.readAsDataURL(file);
  });
}

/**
 * Payload of one server-sent-events line.
 * @param {string} line - A single line, without its terminating "\n".
 * @returns {string|null} Text after "data:" (one leading space removed), or
 *   null when the line is not a data line.
 */
function sseDataPayload(line) {
  // Tolerate CRLF line endings.
  const text = line.endsWith('\r') ? line.slice(0, -1) : line;
  if (!text.startsWith('data:')) return null;
  const payload = text.slice(5);
  return payload.startsWith(' ') ? payload.slice(1) : payload;
}

// Send the image and prompt to the configured endpoint, draw boxes as they
// stream in, then render the final parsed result.
submitBtn.addEventListener('click', async () => {
  if (!apiUrl) {
    alert('Please configure the API URL first.');
    return;
  }
  clearOverlay();
  const file = fileInput.files[0];
  if (!file) {
    alert('Please upload an image.');
    return;
  }

  // One request at a time: overlapping streams would interleave their writes
  // to the shared box state.
  submitBtn.disabled = true;
  // Set once this run has reset the box state, so the error path knows that
  // interimBoxes belongs to this request.
  let requestStarted = false;
  try {
    // Fallback for a click that arrives before the upload handler has finished
    // reading the file. Kept inside the try so a failed read is reported.
    if (!uploadedDataUrl || !base64Image || !mimeType) {
      uploadedDataUrl = await readFileAsDataUrl(file);
      const parts = uploadedDataUrl.split(',');
      mimeType = parts[0].split(':')[1].split(';')[0];
      base64Image = parts[1];
      if (!imgEl.src) {
        imgEl = new Image();
        imgEl.onload = () => {
          setupCanvases(imgEl.naturalWidth, imgEl.naturalHeight);
          drawBaseImage();
        };
        imgEl.src = uploadedDataUrl;
      }
    }

    const prompt = promptTextarea.value.trim();
    const selectedFormat = getSelectedFormat();
    const jsonSchema = selectedFormat === 'format1' ? jsonSchema1 : jsonSchema2;
    progressDiv.textContent = `Uploading and processing (streaming with ${selectedFormat})…`;
    streamStatusEl.hidden = false;
    streamingActive = true;
    interimBoxes = [];
    finalBoxes = [];
    hoveredBox = null;
    requestStarted = true;

    const payload = {
      model: 'gpt-4o',
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: prompt },
            { type: 'image_url', image_url: { url: `data:${mimeType};base64,${base64Image}`, detail: 'high' } }
          ]
        }
      ],
      max_tokens: 24000,
      temperature: 0,
      stream: true,
      response_format: {
        type: 'json_schema',
        json_schema: jsonSchema
      }
    };

    const resp = await fetch(apiUrl, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload)
    });
    if (!resp.ok) throw new Error(`API request failed (${resp.status})`);

    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let accumulated = '';
    // Tail of the stream that does not yet end in a newline. Network chunks do
    // not align with SSE lines, so a line may arrive split across two reads.
    let pendingLine = '';
    let isDone = false;
    let totalParsedInterim = 0;
    while (!isDone) {
      const { done, value } = await reader.read();
      isDone = done;
      // stream: true keeps a multi-byte character that straddles two chunks
      // intact; the last call (done) flushes whatever the decoder still holds.
      pendingLine += decoder.decode(value, { stream: !done });
      const lines = pendingLine.split('\n');
      // Everything but the last element is a complete line; hold the last one
      // back until its newline arrives (or the stream ends).
      pendingLine = done ? '' : lines.pop();
      for (const line of lines) {
        const data = sseDataPayload(line);
        if (data === null) continue;
        if (data === '[DONE]') {
          isDone = true;
          break;
        }
        let parsed;
        try {
          parsed = JSON.parse(data);
        } catch {
          // Not a JSON payload: nothing to extract from this line.
          continue;
        }
        const delta = parsed?.choices?.[0]?.delta?.content;
        if (typeof delta === 'string') {
          accumulated += delta;
          progressDiv.textContent = accumulated.slice(-4000);
          const partial = extractPartialBoxes(accumulated, selectedFormat);
          if (partial.length !== totalParsedInterim) {
            interimBoxes = partial;
            totalParsedInterim = partial.length;
            scheduleOverlayRender();
          }
        }
        if (parsed?.choices?.[0]?.finish_reason) {
          isDone = true;
        }
      }
    }

    streamStatusEl.hidden = true;
    progressDiv.textContent += '\n\nProcessing complete. Parsing final JSON…';
    const jsonStr = extractFinalJsonStringFrom(accumulated);
    if (!jsonStr) throw new Error('Could not extract a complete JSON object from the stream.');
    let result;
    try {
      result = JSON.parse(jsonStr);
    } catch (e) {
      throw new Error('Final JSON parse failed.', { cause: e });
    }
    if (result === null || typeof result !== 'object') {
      throw new Error('Final JSON is not an object.');
    }
    // Convert format2 to format1 for consistent handling
    if (selectedFormat === 'format2') {
      result = convertFormat2ToFormat1(result);
    }
    summaryDiv.textContent = result.summary || 'No summary provided.';
    finalBoxes = Array.isArray(result.bounding_boxes) ? result.bounding_boxes : [];
    streamingActive = false;
    hoveredBox = null;
    scheduleOverlayRender();
    progressDiv.textContent += `\nRendering complete. Final boxes: ${finalBoxes.length}.`;
  } catch (error) {
    console.error(error);
    streamStatusEl.hidden = true;
    streamingActive = false;
    if (requestStarted) {
      // Keep the boxes that did stream in, so what is drawn stays hoverable
      // and does not vanish on the next redraw.
      finalBoxes = interimBoxes;
      hoveredBox = null;
      scheduleOverlayRender();
    }
    progressDiv.textContent = `Failed to process image: ${error.message}`;
  } finally {
    submitBtn.disabled = false;
  }
});
| </script> | |
| </body> | |
| </html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment