// Enhanced Screaming Frog Recipe: Semantic Chunking + Open Source Embeddings
//
// This script breaks webpage content into semantic passages (paragraphs, headings,
// list items, etc.) and embeds each chunk using sentence-transformers/all-MiniLM-L6-v2,
// a lightweight open-source model with solid MTEB performance for its size.
//
// SETUP REQUIRED:
// 1. Get a free Hugging Face token from: https://huggingface.co/settings/tokens
// 2. Replace 'your_hf_token_here' below with your actual token
// 3. The token will be stored in your SEO Spider configuration - be mindful when sharing
//
// MODEL INFO:
// - Model: sentence-transformers/all-MiniLM-L6-v2 (384 dimensions)
// - MTEB Performance: strong among lightweight models given its size and speed
// - Cost: Free via Hugging Face Inference API (with rate limits)
// - Input Length: truncated to 256 word pieces by default (the underlying architecture supports up to 512 tokens)
//
// Uses a minimal fetch-based client that mirrors the featureExtraction() call of the
// official Hugging Face Inference JS client, since this snippet runs in a browser
// environment without module bundling and cannot import the official package.
const HF_TOKEN = 'your_hf_token_here';
const MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2';
// Create a class to mimic the HfInference client functionality using the router API endpoint
class HfInferenceClient {
  constructor(token) {
    this.token = token;
  }

  async featureExtraction(options) {
    // For feature extraction (embeddings), we need to use the feature-extraction pipeline
    // We'll use the direct endpoint for feature-extraction as shown in the HF examples
    const url = `https://router.huggingface.co/hf-inference/models/${options.model}/pipeline/feature-extraction`;
    // For sentence-transformers models, we need to send each text as a separate request
    if (Array.isArray(options.inputs)) {
      // Process each input separately and return an array of embeddings
      const promises = options.inputs.map(text => this.getSingleEmbedding(url, text));
      return Promise.all(promises);
    } else {
      // Single input
      return this.getSingleEmbedding(url, options.inputs);
    }
  }

  async getSingleEmbedding(url, text) {
    // Using the correct request format for feature-extraction
    const response = await fetch(url, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${this.token}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        inputs: text,
      }),
    });
    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`Hugging Face API error (${response.status}): ${errorText}`);
    }
    return response.json();
  }
}
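// Illustrative sketch (not executed): assuming the feature-extraction endpoint returns a
// plain JSON array of floats per input, a call through the client created below would
// look like this, with each vector having 384 dimensions for all-MiniLM-L6-v2:
//
//   const vector = await hfClient.featureExtraction({
//     model: MODEL_NAME,
//     inputs: 'A single passage of page text.',
//   });
//   console.log(vector.length); // expected: 384
//
//   const vectors = await hfClient.featureExtraction({
//     model: MODEL_NAME,
//     inputs: ['First passage.', 'Second passage.'],
//   });
//   console.log(vectors.length); // expected: 2 (one embedding per input text)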
// Initialize the client
const hfClient = new HfInferenceClient(HF_TOKEN);
// Configuration
const CONFIG = {
  minChunkLength: 50, // Minimum characters per chunk
  maxChunkLength: 500, // Maximum characters per chunk (reduced for smaller model)
  includeMetadata: true, // Include element type and position info
  batchSize: 5, // Number of chunks to process in parallel
  retryAttempts: 3, // Retry failed requests
  retryDelay: 1000, // Delay between retries (ms)
};
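// For example, with the defaults above a page that yields 23 chunks is embedded in five
// sequential batches of 5, 5, 5, 5 and 3 texts, and each batch issues one request per
// text through the client above (so up to batchSize requests in flight at once).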
/**
 * Extract semantic passages from the webpage DOM
 * @returns {Array} Array of chunk objects with text and metadata
 */
function extractSemanticChunks() {
  const chunks = [];
  let chunkIndex = 0;
  // Define semantic elements to extract
  const semanticSelectors = [
    'h1, h2, h3, h4, h5, h6', // Headings
    'p', // Paragraphs
    'li', // List items
    'blockquote', // Quotes
    'article', // Articles
    'section', // Sections
    'div[role="main"]', // Main content areas
    'td, th', // Table cells
    'figcaption', // Figure captions
    'summary', // Summary elements
    'dd', // Definition descriptions
  ];
  semanticSelectors.forEach((selector) => {
    const elements = document.querySelectorAll(selector);
    elements.forEach((element, index) => {
      const text = element.textContent?.trim();
      if (text && text.length >= CONFIG.minChunkLength) {
        // Split long chunks while preserving sentence boundaries
        const textChunks = splitLongText(text, CONFIG.maxChunkLength);
        textChunks.forEach((chunkText, subIndex) => {
          const chunk = {
            text: chunkText,
            index: chunkIndex++,
            metadata: CONFIG.includeMetadata
              ? {
                  elementType: element.tagName.toLowerCase(),
                  elementIndex: index,
                  subChunkIndex: subIndex,
                  totalSubChunks: textChunks.length,
                  xpath: getXPath(element),
                  textLength: chunkText.length,
                }
              : null,
          };
          chunks.push(chunk);
        });
      }
    });
  });
  // Fallback: if no semantic elements found, chunk the body text
  if (chunks.length === 0) {
    const bodyText = document.body.textContent?.trim();
    if (bodyText) {
      const textChunks = splitLongText(bodyText, CONFIG.maxChunkLength);
      textChunks.forEach((chunkText, index) => {
        chunks.push({
          text: chunkText,
          index: index,
          metadata: CONFIG.includeMetadata
            ? {
                elementType: 'body',
                elementIndex: 0,
                subChunkIndex: index,
                totalSubChunks: textChunks.length,
                xpath: '/html/body',
                textLength: chunkText.length,
              }
            : null,
        });
      });
    }
  }
  return chunks;
}
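// Shape of one extracted chunk (values are illustrative, not from a real page):
//
//   {
//     text: 'Google Business Profile suspensions often happen after an address change.',
//     index: 12,
//     metadata: {
//       elementType: 'p',
//       elementIndex: 4,
//       subChunkIndex: 0,
//       totalSubChunks: 1,
//       xpath: '/html/body/article/p[5]',
//       textLength: 73,
//     },
//   }
//
// Note: container selectors such as 'article' and 'section' overlap with their child
// 'p' and 'li' elements, so the same text can appear in more than one chunk.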
/**
 * Split long text into smaller chunks while preserving sentence boundaries
 * @param {string} text - Text to split
 * @param {number} maxLength - Maximum length per chunk
 * @returns {Array} Array of text chunks
 */
function splitLongText(text, maxLength) {
  if (text.length <= maxLength) {
    return [text];
  }
  const chunks = [];
  const sentences = text.split(/(?<=[.!?])\s+/);
  let currentChunk = '';
  for (const sentence of sentences) {
    if ((currentChunk + sentence).length <= maxLength) {
      currentChunk += (currentChunk ? ' ' : '') + sentence;
    } else {
      if (currentChunk) {
        chunks.push(currentChunk);
        currentChunk = sentence;
      } else {
        // Handle very long sentences by splitting on word boundaries
        const words = sentence.split(' ');
        let wordChunk = '';
        for (const word of words) {
          if ((wordChunk + word).length <= maxLength) {
            wordChunk += (wordChunk ? ' ' : '') + word;
          } else {
            if (wordChunk) chunks.push(wordChunk);
            wordChunk = word;
          }
        }
        if (wordChunk) currentChunk = wordChunk;
      }
    }
  }
  if (currentChunk) {
    chunks.push(currentChunk);
  }
  return chunks.filter((chunk) => chunk.length >= CONFIG.minChunkLength);
}
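// Worked example (illustrative): with maxLength = 500, a 900-character block of four
// sentences is cut at the first sentence boundary that would push a chunk past 500
// characters, typically producing two chunks of roughly 450 characters each. Any chunk
// that ends up shorter than CONFIG.minChunkLength (50 characters) is filtered out of
// the result.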
/**
 * Get XPath for an element
 * @param {Element} element - DOM element
 * @returns {string} XPath string
 */
function getXPath(element) {
  if (element.id) {
    return `//*[@id="${element.id}"]`;
  }
  const parts = [];
  while (element && element.nodeType === Node.ELEMENT_NODE) {
    let index = 0;
    let sibling = element.previousSibling;
    while (sibling) {
      if (sibling.nodeType === Node.ELEMENT_NODE && sibling.tagName === element.tagName) {
        index++;
      }
      sibling = sibling.previousSibling;
    }
    const tagName = element.tagName.toLowerCase();
    const pathIndex = index > 0 ? `[${index + 1}]` : '';
    parts.unshift(`${tagName}${pathIndex}`);
    element = element.parentNode;
  }
  return parts.length ? `/${parts.join('/')}` : '';
}
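// Examples of the paths this produces (illustrative): an element with an id is addressed
// directly, e.g. '//*[@id="main-content"]', while the third <li> of a list directly
// inside <body> becomes '/html/body/ul/li[3]'. A positional index is only added when an
// element has preceding siblings of the same tag, so the first match is written without
// '[1]'.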
/**
 * Get embeddings for an array of texts using the HF client
 */
async function getEmbeddings(texts) {
  try {
    return await hfClient.featureExtraction({
      model: MODEL_NAME,
      inputs: texts,
      options: {
        wait_for_model: true,
        use_cache: true
      }
    });
  } catch (error) {
    throw new Error(`Hugging Face API error: ${error.message}`);
  }
}
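// Note: the minimal HfInferenceClient above only sends `inputs` in the request body, so
// the `options` object passed here (wait_for_model, use_cache) is currently ignored.
// Failed responses (e.g. cold-start or rate-limit errors) are instead handled by the
// retry logic in processChunksWithRetry below.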
/**
 * Process chunks in batches with retry logic
 * @param {Array} chunks - Array of chunk objects
 * @returns {Promise} Promise resolving to embedded chunks
 */
async function processChunksWithRetry(chunks) {
  const results = [];
  // Process in batches
  for (let i = 0; i < chunks.length; i += CONFIG.batchSize) {
    const batch = chunks.slice(i, i + CONFIG.batchSize);
    const texts = batch.map((chunk) => chunk.text);
    let attempt = 0;
    let embeddings = null;
    while (attempt < CONFIG.retryAttempts && !embeddings) {
      try {
        embeddings = await getEmbeddings(texts);
        // Combine chunks with their embeddings
        batch.forEach((chunk, index) => {
          results.push({
            ...chunk,
            embedding: embeddings[index],
            embeddingModel: MODEL_NAME,
            processingTimestamp: new Date().toISOString(),
          });
        });
      } catch (error) {
        attempt++;
        console.warn(`Batch ${Math.floor(i / CONFIG.batchSize) + 1} attempt ${attempt} failed:`, error.message);
        if (attempt < CONFIG.retryAttempts) {
          await new Promise((resolve) => setTimeout(resolve, CONFIG.retryDelay * attempt));
        } else {
          // Add chunks without embeddings as fallback
          batch.forEach((chunk) => {
            results.push({
              ...chunk,
              embedding: null,
              error: error.message,
              embeddingModel: MODEL_NAME,
              processingTimestamp: new Date().toISOString(),
            });
          });
        }
      }
    }
  }
  return results;
}
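// Retry behaviour with the defaults: a failing batch is retried after roughly 1 s
// (retryDelay * 1) and again after 2 s (retryDelay * 2); after the third failed attempt
// its chunks are kept in the output with embedding: null and the error message attached,
// so a single rate-limited batch never aborts the whole page.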
/**
 * Main processing function
 * @returns {Promise} Promise resolving to processing results
 */
async function processPageEmbeddings() {
  try {
    // Validate configuration
    if (HF_TOKEN === 'your_hf_token_here') {
      throw new Error('Please set your Hugging Face token in the HF_TOKEN variable');
    }
    // Extract semantic chunks
    console.log('Extracting semantic chunks from webpage...');
    const chunks = extractSemanticChunks();
    console.log(`Extracted ${chunks.length} semantic chunks`);
    if (chunks.length === 0) {
      throw new Error('No content chunks found on the page');
    }
    // Process embeddings
    console.log(`Processing embeddings using ${MODEL_NAME}...`);
    const embeddedChunks = await processChunksWithRetry(chunks);
    // Generate summary statistics
    const successfulEmbeddings = embeddedChunks.filter((chunk) => chunk.embedding !== null).length;
    const failedEmbeddings = embeddedChunks.length - successfulEmbeddings;
    const result = {
      success: true,
      model: MODEL_NAME,
      totalChunks: embeddedChunks.length,
      successfulEmbeddings,
      failedEmbeddings,
      processingTimestamp: new Date().toISOString(),
      pageUrl: window.location.href,
      pageTitle: document.title,
      chunks: embeddedChunks,
      summary: {
        avgChunkLength: Math.round(embeddedChunks.reduce((sum, chunk) => sum + chunk.text.length, 0) / embeddedChunks.length),
        elementTypes: [...new Set(embeddedChunks.map((chunk) => chunk.metadata?.elementType).filter(Boolean))],
        embeddingDimensions: embeddedChunks.find((chunk) => chunk.embedding)?.embedding?.length || null,
      },
    };
    console.log(`Processing complete: ${successfulEmbeddings}/${embeddedChunks.length} chunks embedded successfully`);
    return result;
  } catch (error) {
    console.error('Processing failed:', error);
    return {
      success: false,
      error: error.message,
      model: MODEL_NAME,
      processingTimestamp: new Date().toISOString(),
      pageUrl: window.location.href,
      pageTitle: document.title,
    };
  }
}
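// Sketch of the JSON returned on success (values illustrative, fields abbreviated):
//
//   {
//     "success": true,
//     "model": "sentence-transformers/all-MiniLM-L6-v2",
//     "totalChunks": 42,
//     "successfulEmbeddings": 42,
//     "failedEmbeddings": 0,
//     "pageUrl": "https://example.com/page",
//     "pageTitle": "Example Page",
//     "chunks": [ { "text": "...", "embedding": [0.021, -0.043, ...], ... } ],
//     "summary": { "avgChunkLength": 180, "elementTypes": ["h2", "p", "li"], "embeddingDimensions": 384 }
//   }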
// Execute the main function and return results to Screaming Frog
return processPageEmbeddings()
  .then((result) => seoSpider.data(JSON.stringify(result, null, 2)))
  .catch((error) => seoSpider.error(`Script execution failed: ${error.message}`));
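// Usage note (a sketch, based on how SEO Spider custom snippets generally work): add this
// as a custom JavaScript extraction snippet (Configuration > Custom > Custom JavaScript)
// with JavaScript rendering enabled; the JSON string passed to seoSpider.data() then
// appears against each crawled URL in the Custom JavaScript tab, ready to export for
// downstream analysis.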