// Enhanced Screaming Frog Recipe: Semantic Chunking + Open Source Embeddings
//
// This script breaks webpage content into semantic passages (paragraphs, headings,
// list items, etc.) and embeds each chunk using sentence-transformers/all-MiniLM-L6-v2,
// a lightweight open-source model with solid MTEB performance for its size.
//
// SETUP REQUIRED:
// 1. Get a free Hugging Face token from: https://huggingface.co/settings/tokens
// 2. Replace 'your_hf_token_here' below with your actual token
// 3. The token will be stored in your SEO Spider configuration - be mindful when sharing
//
// MODEL INFO:
// - Model: sentence-transformers/all-MiniLM-L6-v2 (384 dimensions)
// - MTEB Performance: strong among lightweight models given its size and speed
// - Cost: Free via Hugging Face Inference API (with rate limits)
// - Input Length: truncated to 256 word pieces by default (the underlying architecture supports up to 512 tokens)
//
// Uses a minimal fetch-based client that mirrors the featureExtraction() call of the
// official Hugging Face Inference JS client, since this snippet runs in a browser
// environment without module bundling and cannot import the official package.
const HF_TOKEN = 'your_hf_token_here';
const MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2';
// Create a class to mimic the HfInference client functionality using the router API endpoint
class HfInferenceClient {
  constructor(token) {
    this.token = token;
  }

  async featureExtraction(options) {
    // For feature extraction (embeddings), we need to use the feature-extraction pipeline
    // We'll use the direct endpoint for feature-extraction as shown in the HF examples
    const url = `https://router.huggingface.co/hf-inference/models/${options.model}/pipeline/feature-extraction`;
    // For sentence-transformers models, we need to send each text as a separate request
    if (Array.isArray(options.inputs)) {
      // Process each input separately and return an array of embeddings
      const promises = options.inputs.map(text => this.getSingleEmbedding(url, text));
      return Promise.all(promises);
    } else {
      // Single input
      return this.getSingleEmbedding(url, options.inputs);
    }
  }

  async getSingleEmbedding(url, text) {
    // Using the correct request format for feature-extraction
    const response = await fetch(url, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${this.token}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        inputs: text,
      }),
    });
    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`Hugging Face API error (${response.status}): ${errorText}`);
    }
    return response.json();
  }
}
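// Illustrative sketch (not executed): assuming the feature-extraction endpoint returns a
// plain JSON array of floats per input, a call through the client created below would
// look like this, with each vector having 384 dimensions for all-MiniLM-L6-v2:
//
//   const vector = await hfClient.featureExtraction({
//     model: MODEL_NAME,
//     inputs: 'A single passage of page text.',
//   });
//   console.log(vector.length); // expected: 384
//
//   const vectors = await hfClient.featureExtraction({
//     model: MODEL_NAME,
//     inputs: ['First passage.', 'Second passage.'],
//   });
//   console.log(vectors.length); // expected: 2 (one embedding per input text)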
// Initialize the client
const hfClient = new HfInferenceClient(HF_TOKEN);
// Configuration
const CONFIG = {
  minChunkLength: 50, // Minimum characters per chunk
  maxChunkLength: 500, // Maximum characters per chunk (reduced for smaller model)
  includeMetadata: true, // Include element type and position info
  batchSize: 5, // Number of chunks to process in parallel
  retryAttempts: 3, // Retry failed requests
  retryDelay: 1000, // Delay between retries (ms)
};
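// For example, with the defaults above a page that yields 23 chunks is embedded in five
// sequential batches of 5, 5, 5, 5 and 3 texts, and each batch issues one request per
// text through the client above (so up to batchSize requests in flight at once).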
/**
 * Extract semantic passages from the webpage DOM
 * @returns {Array} Array of chunk objects with text and metadata
 */
function extractSemanticChunks() {
  const chunks = [];
  let chunkIndex = 0;
  // Define semantic elements to extract
  const semanticSelectors = [
    'h1, h2, h3, h4, h5, h6', // Headings
    'p', // Paragraphs
    'li', // List items
    'blockquote', // Quotes
    'article', // Articles
    'section', // Sections
    'div[role="main"]', // Main content areas
    'td, th', // Table cells
    'figcaption', // Figure captions
    'summary', // Summary elements
    'dd', // Definition descriptions
  ];
  semanticSelectors.forEach((selector) => {
    const elements = document.querySelectorAll(selector);
    elements.forEach((element, index) => {
      const text = element.textContent?.trim();
      if (text && text.length >= CONFIG.minChunkLength) {
        // Split long chunks while preserving sentence boundaries
        const textChunks = splitLongText(text, CONFIG.maxChunkLength);
        textChunks.forEach((chunkText, subIndex) => {
          const chunk = {
            text: chunkText,
            index: chunkIndex++,
            metadata: CONFIG.includeMetadata
              ? {
                  elementType: element.tagName.toLowerCase(),
                  elementIndex: index,
                  subChunkIndex: subIndex,
                  totalSubChunks: textChunks.length,
                  xpath: getXPath(element),
                  textLength: chunkText.length,
                }
              : null,
          };
          chunks.push(chunk);
        });
      }
    });
  });
  // Fallback: if no semantic elements found, chunk the body text
  if (chunks.length === 0) {
    const bodyText = document.body.textContent?.trim();
    if (bodyText) {
      const textChunks = splitLongText(bodyText, CONFIG.maxChunkLength);
      textChunks.forEach((chunkText, index) => {
        chunks.push({
          text: chunkText,
          index: index,
          metadata: CONFIG.includeMetadata
            ? {
                elementType: 'body',
                elementIndex: 0,
                subChunkIndex: index,
                totalSubChunks: textChunks.length,
                xpath: '/html/body',
                textLength: chunkText.length,
              }
            : null,
        });
      });
    }
  }
  return chunks;
}
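// Shape of one extracted chunk (values are illustrative, not from a real page):
//
//   {
//     text: 'Google Business Profile suspensions often happen after an address change.',
//     index: 12,
//     metadata: {
//       elementType: 'p',
//       elementIndex: 4,
//       subChunkIndex: 0,
//       totalSubChunks: 1,
//       xpath: '/html/body/article/p[5]',
//       textLength: 73,
//     },
//   }
//
// Note: container selectors such as 'article' and 'section' overlap with their child
// 'p' and 'li' elements, so the same text can appear in more than one chunk.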
/**
 * Split long text into smaller chunks while preserving sentence boundaries
 * @param {string} text - Text to split
 * @param {number} maxLength - Maximum length per chunk
 * @returns {Array} Array of text chunks
 */
function splitLongText(text, maxLength) {
  if (text.length <= maxLength) {
    return [text];
  }
  const chunks = [];
  const sentences = text.split(/(?<=[.!?])\s+/);
  let currentChunk = '';
  for (const sentence of sentences) {
    if ((currentChunk + sentence).length <= maxLength) {
      currentChunk += (currentChunk ? ' ' : '') + sentence;
    } else {
      if (currentChunk) {
        chunks.push(currentChunk);
        currentChunk = sentence;
      } else {
        // Handle very long sentences by splitting on word boundaries
        const words = sentence.split(' ');
        let wordChunk = '';
        for (const word of words) {
          if ((wordChunk + word).length <= maxLength) {
            wordChunk += (wordChunk ? ' ' : '') + word;
          } else {
            if (wordChunk) chunks.push(wordChunk);
            wordChunk = word;
          }
        }
        if (wordChunk) currentChunk = wordChunk;
      }
    }
  }
  if (currentChunk) {
    chunks.push(currentChunk);
  }
  return chunks.filter((chunk) => chunk.length >= CONFIG.minChunkLength);
}
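// Worked example (illustrative): with maxLength = 500, a 900-character block of four
// sentences is cut at the first sentence boundary that would push a chunk past 500
// characters, typically producing two chunks of roughly 450 characters each. Any chunk
// that ends up shorter than CONFIG.minChunkLength (50 characters) is filtered out of
// the result.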
/**
 * Get XPath for an element
 * @param {Element} element - DOM element
 * @returns {string} XPath string
 */
function getXPath(element) {
  if (element.id) {
    return `//*[@id="${element.id}"]`;
  }
  const parts = [];
  while (element && element.nodeType === Node.ELEMENT_NODE) {
    let index = 0;
    let sibling = element.previousSibling;
    while (sibling) {
      if (sibling.nodeType === Node.ELEMENT_NODE && sibling.tagName === element.tagName) {
        index++;
      }
      sibling = sibling.previousSibling;
    }
    const tagName = element.tagName.toLowerCase();
    const pathIndex = index > 0 ? `[${index + 1}]` : '';
    parts.unshift(`${tagName}${pathIndex}`);
    element = element.parentNode;
  }
  return parts.length ? `/${parts.join('/')}` : '';
}
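// Examples of the paths this produces (illustrative): an element with an id is addressed
// directly, e.g. '//*[@id="main-content"]', while the third <li> of a list directly
// inside <body> becomes '/html/body/ul/li[3]'. A positional index is only added when an
// element has preceding siblings of the same tag, so the first match is written without
// '[1]'.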
/**
 * Get embeddings for an array of texts using the HF client
 */
async function getEmbeddings(texts) {
  try {
    return await hfClient.featureExtraction({
      model: MODEL_NAME,
      inputs: texts,
      options: {
        wait_for_model: true,
        use_cache: true
      }
    });
  } catch (error) {
    throw new Error(`Hugging Face API error: ${error.message}`);
  }
}
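// Note: the minimal HfInferenceClient above only sends `inputs` in the request body, so
// the `options` object passed here (wait_for_model, use_cache) is currently ignored.
// Failed responses (e.g. cold-start or rate-limit errors) are instead handled by the
// retry logic in processChunksWithRetry below.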
/**
 * Process chunks in batches with retry logic
 * @param {Array} chunks - Array of chunk objects
 * @returns {Promise} Promise resolving to embedded chunks
 */
async function processChunksWithRetry(chunks) {
  const results = [];
  // Process in batches
  for (let i = 0; i < chunks.length; i += CONFIG.batchSize) {
    const batch = chunks.slice(i, i + CONFIG.batchSize);
    const texts = batch.map((chunk) => chunk.text);
    let attempt = 0;
    let embeddings = null;
    while (attempt < CONFIG.retryAttempts && !embeddings) {
      try {
        embeddings = await getEmbeddings(texts);
        // Combine chunks with their embeddings
        batch.forEach((chunk, index) => {
          results.push({
            ...chunk,
            embedding: embeddings[index],
            embeddingModel: MODEL_NAME,
            processingTimestamp: new Date().toISOString(),
          });
        });
      } catch (error) {
        attempt++;
        console.warn(`Batch ${Math.floor(i / CONFIG.batchSize) + 1} attempt ${attempt} failed:`, error.message);
        if (attempt < CONFIG.retryAttempts) {
          await new Promise((resolve) => setTimeout(resolve, CONFIG.retryDelay * attempt));
        } else {
          // Add chunks without embeddings as fallback
          batch.forEach((chunk) => {
            results.push({
              ...chunk,
              embedding: null,
              error: error.message,
              embeddingModel: MODEL_NAME,
              processingTimestamp: new Date().toISOString(),
            });
          });
        }
      }
    }
  }
  return results;
}
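// Retry behaviour with the defaults: a failing batch is retried after roughly 1 s
// (retryDelay * 1) and again after 2 s (retryDelay * 2); after the third failed attempt
// its chunks are kept in the output with embedding: null and the error message attached,
// so a single rate-limited batch never aborts the whole page.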
/**
 * Main processing function
 * @returns {Promise} Promise resolving to processing results
 */
async function processPageEmbeddings() {
  try {
    // Validate configuration
    if (HF_TOKEN === 'your_hf_token_here') {
      throw new Error('Please set your Hugging Face token in the HF_TOKEN variable');
    }
    // Extract semantic chunks
    console.log('Extracting semantic chunks from webpage...');
    const chunks = extractSemanticChunks();
    console.log(`Extracted ${chunks.length} semantic chunks`);
    if (chunks.length === 0) {
      throw new Error('No content chunks found on the page');
    }
    // Process embeddings
    console.log(`Processing embeddings using ${MODEL_NAME}...`);
    const embeddedChunks = await processChunksWithRetry(chunks);
    // Generate summary statistics
    const successfulEmbeddings = embeddedChunks.filter((chunk) => chunk.embedding !== null).length;
    const failedEmbeddings = embeddedChunks.length - successfulEmbeddings;
    const result = {
      success: true,
      model: MODEL_NAME,
      totalChunks: embeddedChunks.length,
      successfulEmbeddings,
      failedEmbeddings,
      processingTimestamp: new Date().toISOString(),
      pageUrl: window.location.href,
      pageTitle: document.title,
      chunks: embeddedChunks,
      summary: {
        avgChunkLength: Math.round(embeddedChunks.reduce((sum, chunk) => sum + chunk.text.length, 0) / embeddedChunks.length),
        elementTypes: [...new Set(embeddedChunks.map((chunk) => chunk.metadata?.elementType).filter(Boolean))],
        embeddingDimensions: embeddedChunks.find((chunk) => chunk.embedding)?.embedding?.length || null,
      },
    };
    console.log(`Processing complete: ${successfulEmbeddings}/${embeddedChunks.length} chunks embedded successfully`);
    return result;
  } catch (error) {
    console.error('Processing failed:', error);
    return {
      success: false,
      error: error.message,
      model: MODEL_NAME,
      processingTimestamp: new Date().toISOString(),
      pageUrl: window.location.href,
      pageTitle: document.title,
    };
  }
}
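// Sketch of the JSON returned on success (values illustrative, fields abbreviated):
//
//   {
//     "success": true,
//     "model": "sentence-transformers/all-MiniLM-L6-v2",
//     "totalChunks": 42,
//     "successfulEmbeddings": 42,
//     "failedEmbeddings": 0,
//     "pageUrl": "https://example.com/page",
//     "pageTitle": "Example Page",
//     "chunks": [ { "text": "...", "embedding": [0.021, -0.043, ...], ... } ],
//     "summary": { "avgChunkLength": 180, "elementTypes": ["h2", "p", "li"], "embeddingDimensions": 384 }
//   }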
// Execute the main function and return results to Screaming Frog
return processPageEmbeddings()
  .then((result) => seoSpider.data(JSON.stringify(result, null, 2)))
  .catch((error) => seoSpider.error(`Script execution failed: ${error.message}`));
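// Usage note (a sketch, based on how SEO Spider custom snippets generally work): add this
// as a custom JavaScript extraction snippet (Configuration > Custom > Custom JavaScript)
// with JavaScript rendering enabled; the JSON string passed to seoSpider.data() then
// appears against each crawled URL in the Custom JavaScript tab, ready to export for
// downstream analysis.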