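/*
 * Extracts image URLs (the src attributes of <img> tags) from an HTML
 * document using libxml2's HTML parser and XPath, resolves relative URLs
 * against a caller-supplied base URL, and stores the results in a
 * dynamically growing collection. A separate-chaining hash table is used
 * to skip duplicate URLs in O(1) expected time.
 */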
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>

// Initial hash table size (prime number for better distribution)
#define INITIAL_HASH_SIZE 101
#define MAX_LOAD_FACTOR 0.75
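/*
 * Building (a typical command line; assumes the libxml2 development headers
 * are installed and the file is saved as extract_image_urls.c):
 *
 *   cc -o extract_image_urls extract_image_urls.c $(xml2-config --cflags --libs)
 *
 * or, with pkg-config:
 *
 *   cc -o extract_image_urls extract_image_urls.c \
 *       $(pkg-config --cflags --libs libxml-2.0)
 */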
int is_prime (int n);

// Structure for hash table entries
typedef struct hash_entry
{
  char *url;                 // URL string
  struct hash_entry *next;   // Next entry in chain
} hash_entry_t;

// Structure to hold URL collection
typedef struct
{
  char **urls;               // Dynamic array of URL strings
  int count;                 // Number of URLs in the collection
  int capacity;              // Current capacity of the array
  hash_entry_t **hash_table; // Hash table for O(1) duplicate checking
  int hash_size;             // Current size of hash table
  int hash_count;            // Number of entries in hash table
} url_collection_t;
/**
 * Initialize a new URL collection
 * Returns a pointer to the collection or NULL on failure
 */
url_collection_t *
create_url_collection (int initial_capacity)
{
  // Use a reasonable default if no capacity provided
  if (initial_capacity <= 0)
    {
      initial_capacity = 8;
    }

  // Allocate the collection struct
  url_collection_t *collection
      = (url_collection_t *) malloc (sizeof (url_collection_t));
  if (!collection)
    {
      fprintf (stderr, "Failed to allocate URL collection.\n");
      return NULL;
    }

  // Initialize to zero/NULL
  memset (collection, 0, sizeof (url_collection_t));

  // Allocate the URL array
  collection->urls = (char **) malloc (initial_capacity * sizeof (char *));
  if (!collection->urls)
    {
      fprintf (stderr, "Failed to allocate memory for URL array.\n");
      free (collection);
      return NULL;
    }
  collection->capacity = initial_capacity;

  // Initialize hash table
  collection->hash_table
      = (hash_entry_t **) calloc (INITIAL_HASH_SIZE, sizeof (hash_entry_t *));
  if (!collection->hash_table)
    {
      fprintf (stderr, "Failed to initialize hash table.\n");
      free (collection->urls);
      free (collection);
      return NULL;
    }
  collection->hash_size = INITIAL_HASH_SIZE;

  return collection;
}
/**
 * Simple polynomial string hash (the classic multiply-by-31 scheme)
 */
unsigned int
hash_string (const char *str, int table_size)
{
  unsigned int hash = 0;
  while (*str)
    {
      // Cast to unsigned char so bytes >= 0x80 hash consistently
      hash = hash * 31 + (unsigned char) (*str++);
    }
  return hash % table_size;
}
/**
 * Check if a number is prime (helper for hash table resizing)
 */
int
is_prime (int n)
{
  if (n <= 1)
    return 0;
  if (n <= 3)
    return 1;
  if (n % 2 == 0 || n % 3 == 0)
    return 0;
  for (int i = 5; i * i <= n; i += 6)
    {
      if (n % i == 0 || n % (i + 2) == 0)
        return 0;
    }
  return 1;
}
/**
 * Check if a URL is already in the hash table
 * Returns 1 if found, 0 if not found
 */
int
url_exists (url_collection_t *collection, const char *url)
{
  if (!collection || !collection->hash_table)
    {
      return 0;
    }

  unsigned int hash = hash_string (url, collection->hash_size);

  // Find linked list at this hash position
  hash_entry_t *entry = collection->hash_table[hash];
  while (entry != NULL)
    {
      // Compare to existing URLs
      if (strcmp (entry->url, url) == 0)
        {
          return 1; // URL already exists
        }
      // Move to next entry in hash chain
      entry = entry->next;
    }
  return 0; // URL not found
}
/**
 * Resize hash table when load factor exceeds threshold
 * Returns 1 on success, 0 on failure
 */
int
resize_hash_table (url_collection_t *collection)
{
  int old_size = collection->hash_size;
  hash_entry_t **old_table = collection->hash_table;

  // Find next prime number approximately double the current size
  int new_size = old_size * 2 + 1;
  while (!is_prime (new_size))
    {
      new_size += 2;
    }

  // Allocate new table
  hash_entry_t **new_table
      = (hash_entry_t **) calloc (new_size, sizeof (hash_entry_t *));
  if (!new_table)
    {
      fprintf (stderr, "Failed to allocate new hash table\n");
      return 0;
    }

  // Rehash all entries
  for (int i = 0; i < old_size; i++)
    {
      hash_entry_t *entry = old_table[i];
      while (entry != NULL)
        {
          // Save next pointer before we modify the entry
          hash_entry_t *next = entry->next;

          // Insert into new table
          unsigned int hash = hash_string (entry->url, new_size);
          entry->next = new_table[hash];
          new_table[hash] = entry;

          entry = next;
        }
    }

  // Update collection with new table
  collection->hash_table = new_table;
  collection->hash_size = new_size;

  // Free the old table (but not entries - they're reused)
  free (old_table);
  return 1;
}
/**
 * Add URL to hash table for duplicate checking
 * Returns 1 on success, 0 on failure
 */
int
add_to_hash (url_collection_t *collection, const char *url)
{
  // Check if we need to resize the hash table
  double load_factor
      = (double) (collection->hash_count + 1) / collection->hash_size;
  if (load_factor > MAX_LOAD_FACTOR)
    {
      if (!resize_hash_table (collection))
        {
          return 0;
        }
    }

  // Create new entry
  hash_entry_t *entry = (hash_entry_t *) malloc (sizeof (hash_entry_t));
  if (!entry)
    {
      fprintf (stderr, "Failed to allocate hash entry\n");
      return 0;
    }

  // Copy the URL
  entry->url = strdup (url);
  if (!entry->url)
    {
      fprintf (stderr, "Failed to duplicate URL for hash table\n");
      free (entry);
      return 0;
    }

  // Insert at head of chain
  unsigned int hash = hash_string (url, collection->hash_size);
  entry->next = collection->hash_table[hash];
  collection->hash_table[hash] = entry;
  collection->hash_count++;
  return 1;
}
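/*
 * Design note: the table uses separate chaining, and whenever an insert
 * would push the load factor above MAX_LOAD_FACTOR it is grown to roughly
 * twice its size (rounded up to the next prime), so duplicate checks and
 * inserts stay O(1) on average, amortized over the occasional rehash.
 */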
/**
 * Add a URL to the collection
 * The collection will automatically resize if needed
 * Duplicate URLs will not be added (O(1) lookup)
 * Returns 1 on success, 0 on failure
 */
int
add_url (url_collection_t *collection, const char *url)
{
  if (!collection || !url)
    {
      return 0;
    }

  // Check if this URL already exists in our collection
  if (url_exists (collection, url))
    {
      printf ("Skipping duplicate URL: %s\n", url);
      return 1; // Not an error, just a duplicate
    }

  // Check if we need to resize the array
  if (collection->count >= collection->capacity)
    {
      // Double the capacity when resizing (common growth strategy)
      int new_capacity = collection->capacity * 2;
      char **new_urls = (char **) realloc (collection->urls,
                                           new_capacity * sizeof (char *));
      if (new_urls == NULL)
        {
          fprintf (stderr, "Failed to resize URL collection.\n");
          return 0;
        }
      collection->urls = new_urls;
      collection->capacity = new_capacity;
    }

  // Add the URL to the collection
  collection->urls[collection->count] = strdup (url);
  if (collection->urls[collection->count] == NULL)
    {
      fprintf (stderr, "Failed to duplicate URL string.\n");
      return 0;
    }

  // Add to hash table for O(1) duplicate checking
  if (!add_to_hash (collection, url))
    {
      // Failed to add to hash table, free the URL string
      free (collection->urls[collection->count]);
      collection->urls[collection->count] = NULL;
      return 0;
    }

  collection->count++;
  printf ("Added URL to collection: %s\n", url);
  return 1;
}
/**
 * Free the URL collection and hash table
 */
void
free_url_collection (url_collection_t *collection)
{
  if (!collection)
    {
      return;
    }

  // Free URL strings
  if (collection->urls)
    {
      for (int i = 0; i < collection->count; i++)
        {
          free (collection->urls[i]);
        }
      free (collection->urls);
    }

  // Free hash table entries
  if (collection->hash_table)
    {
      for (int i = 0; i < collection->hash_size; i++)
        {
          hash_entry_t *entry = collection->hash_table[i];
          while (entry != NULL)
            {
              hash_entry_t *next = entry->next;
              free (entry->url);
              free (entry);
              entry = next;
            }
        }
      free (collection->hash_table);
    }

  // Finally free the collection struct itself
  free (collection);
}
/**
 * Print all URLs in the collection
 */
void
print_url_collection (url_collection_t *collection)
{
  if (!collection)
    {
      printf ("No collection to print.\n");
      return;
    }

  printf ("\nCollected Image URLs (%d):\n", collection->count);
  printf ("------------------------------------------------\n");
  for (int i = 0; i < collection->count; i++)
    {
      printf ("[%d] %s\n", i + 1, collection->urls[i]);
    }
  printf ("------------------------------------------------\n");
}
/**
 * Process HTML and extract image URLs using XPath
 * Adds all found URLs to the provided collection
 * Returns the number of new (unique) URLs added, or -1 on error
 */
int
extract_image_urls (url_collection_t *collection, const char *html,
                    const char *base_url)
{
  if (!collection || !html)
    {
      fprintf (stderr, "Invalid parameters for extract_image_urls\n");
      return -1;
    }

  int initial_count = collection->count;

  // Initialize libxml2
  xmlInitParser ();

  // Parse the HTML string into a DOM document
  htmlDocPtr doc = htmlReadMemory (html, strlen (html), base_url, NULL,
                                   HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
  if (doc == NULL)
    {
      fprintf (stderr, "Failed to parse HTML.\n");
      xmlCleanupParser ();
      return -1;
    }

  // Create XPath evaluation context
  xmlXPathContextPtr xpath_ctx = xmlXPathNewContext (doc);
  if (xpath_ctx == NULL)
    {
      fprintf (stderr, "Failed to create XPath context.\n");
      xmlFreeDoc (doc);
      xmlCleanupParser ();
      return -1;
    }

  // Evaluate XPath expression to get all img src attributes
  xmlXPathObjectPtr xpath_obj
      = xmlXPathEvalExpression ((const xmlChar *) "//img/@src", xpath_ctx);
  if (xpath_obj == NULL)
    {
      fprintf (stderr, "Failed to evaluate XPath expression.\n");
      xmlXPathFreeContext (xpath_ctx);
      xmlFreeDoc (doc);
      xmlCleanupParser ();
      return -1;
    }

  // Process the results
  xmlNodeSetPtr nodes = xpath_obj->nodesetval;
  if (nodes)
    {
      for (int i = 0; i < nodes->nodeNr; i++)
        {
          xmlNodePtr node = nodes->nodeTab[i];
          if (node->type != XML_ATTRIBUTE_NODE)
            continue;

          xmlChar *src = xmlNodeGetContent (node);
          if (!src)
            continue;

          // Handle different types of URLs if base_url is provided
          if (base_url && !strstr ((char *) src, "://"))
            {
              if (src[0] == '/')
                {
                  // URLs starting with / need the domain part of base_url
                  char domain[1024] = { 0 };

                  // Extract domain from base_url (e.g., https://example.com)
                  if (strstr (base_url, "://"))
                    {
                      const char *proto_end = strstr (base_url, "://") + 3;
                      char *domain_end = strchr (proto_end, '/');
                      if (domain_end)
                        {
                          // Copy only up to the first slash after the domain
                          size_t domain_len = (size_t) (domain_end - base_url);
                          if (domain_len >= sizeof (domain))
                            domain_len = sizeof (domain) - 1;
                          memcpy (domain, base_url, domain_len);
                          domain[domain_len] = '\0';
                        }
                      else
                        {
                          // No slash after domain, use entire base_url
                          snprintf (domain, sizeof (domain), "%s", base_url);
                        }

                      // Construct the full URL using domain + src
                      size_t full_len
                          = strlen (domain) + strlen ((char *) src) + 1;
                      char *full_path = (char *) malloc (full_len);
                      if (full_path)
                        {
                          snprintf (full_path, full_len, "%s%s", domain,
                                    (char *) src);
                          add_url (collection, full_path);
                          free (full_path);
                        }
                    }
                  else
                    {
                      add_url (collection, (char *) src);
                    }
                }
              else
                {
                  // Regular relative URLs (without leading slash)
                  xmlChar *full_url
                      = xmlBuildURI (src, (const xmlChar *) base_url);
                  if (full_url)
                    {
                      add_url (collection, (char *) full_url);
                      xmlFree (full_url);
                    }
                  else
                    {
                      add_url (collection, (char *) src);
                    }
                }
            }
          else
            {
              // Absolute URLs or no base_url provided
              add_url (collection, (char *) src);
            }
          xmlFree (src);
        }
    }

  // Cleanup (xmlCleanupParser is intended for process shutdown; calling it
  // per extraction is acceptable in this single-threaded example)
  xmlXPathFreeObject (xpath_obj);
  xmlXPathFreeContext (xpath_ctx);
  xmlFreeDoc (doc);
  xmlCleanupParser ();

  // Return the actual number of new URLs added
  return collection->count - initial_count;
}
// Example usage
int
main (void)
{
  // Create a new URL collection
  url_collection_t *collection = create_url_collection (0);
  if (!collection)
    {
      fprintf (stderr, "Failed to create URL collection.\n");
      return 1;
    }

  // Example HTML with various img tags (the last two are duplicates)
  const char *html_example
      = "<html><body>"
        "<h1>Sample Page</h1>"
        "<p>Text with <img src='/images/image3.gif'> embedded image</p>"
        "<img src='image1.jpg' alt='Image 1'>"
        "<div><img src='https://example.com/image2.png' alt='Image 2'></div>"
        "<p>Text with <img src='/images/image3.gif'> embedded image</p>"
        "<img src='image1.jpg' alt='Image 1'>"
        "</body></html>";
  const char *base_url = "https://example.com/page/";

  printf ("Extracting image URLs from sample HTML:\n");
  int found = extract_image_urls (collection, html_example, base_url);
  printf ("Found %d unique image URLs\n", found);

  // Print collected URLs
  print_url_collection (collection);

  // Free the URL collection
  free_url_collection (collection);
  return 0;
}
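/*
 * Expected output (roughly; exact strings depend on libxml2's URI
 * resolution): three unique URLs are collected from the sample HTML,
 *
 *   https://example.com/images/image3.gif   (root-relative, resolved against the domain)
 *   https://example.com/page/image1.jpg     (relative, resolved with xmlBuildURI)
 *   https://example.com/image2.png          (already absolute)
 *
 * and the two repeated <img> tags are reported as skipped duplicates.
 */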