@7etsuo
Created March 17, 2025 00:59
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
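/*
 * A minimal sketch of a build command, assuming the libxml2 development
 * headers are installed and this file is saved as extract_img_urls.c
 * (file name chosen here for illustration; exact flags vary by system):
 *
 *   gcc extract_img_urls.c -o extract_img_urls $(xml2-config --cflags --libs)
 */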
// Initial hash table size (prime number for better distribution)
#define INITIAL_HASH_SIZE 101
#define MAX_LOAD_FACTOR 0.75
int is_prime (int n);
// Structure for hash table entries
typedef struct hash_entry
{
  char *url;               // URL string
  struct hash_entry *next; // Next entry in chain
} hash_entry_t;
// Structure to hold URL collection
typedef struct
{
  char **urls;               // Dynamic array of URL strings
  int count;                 // Number of URLs in the collection
  int capacity;              // Current capacity of the array
  hash_entry_t **hash_table; // Hash table for O(1) duplicate checking
  int hash_size;             // Current size of hash table
  int hash_count;            // Number of entries in hash table
} url_collection_t;
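// Note: each URL string is stored twice, once in the urls array and once in
// its hash table entry (both via strdup), so the two structures can be
// freed independently.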
/**
 * Initialize a new URL collection
 * Returns a pointer to the collection or NULL on failure
 */
url_collection_t *
create_url_collection (int initial_capacity)
{
  // Use a reasonable default if no capacity provided
  if (initial_capacity <= 0)
    {
      initial_capacity = 8;
    }
  // Allocate the collection struct
  url_collection_t *collection
      = (url_collection_t *) malloc (sizeof (url_collection_t));
  if (!collection)
    {
      fprintf (stderr, "Failed to allocate URL collection.\n");
      return NULL;
    }
  // Initialize to zero/NULL
  memset (collection, 0, sizeof (url_collection_t));
  // Allocate the URL array
  collection->urls = (char **) malloc (initial_capacity * sizeof (char *));
  if (!collection->urls)
    {
      fprintf (stderr, "Failed to allocate memory for URL array.\n");
      free (collection);
      return NULL;
    }
  collection->capacity = initial_capacity;
  // Initialize hash table
  collection->hash_table
      = (hash_entry_t **) calloc (INITIAL_HASH_SIZE, sizeof (hash_entry_t *));
  if (!collection->hash_table)
    {
      fprintf (stderr, "Failed to initialize hash table.\n");
      free (collection->urls);
      free (collection);
      return NULL;
    }
  collection->hash_size = INITIAL_HASH_SIZE;
  return collection;
}
/**
 * Simple multiplicative string hash (hash = hash * 31 + c), reduced
 * modulo the table size
 */
unsigned int
hash_string (const char *str, int table_size)
{
  unsigned int hash = 0;
  while (*str)
    {
      hash = hash * 31 + (*str++);
    }
  return hash % table_size;
}
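/* Worked example: hash_string ("abc", 101) computes
   ((0 * 31 + 'a') * 31 + 'b') * 31 + 'c' = (97 * 31 + 98) * 31 + 99 = 96354,
   and 96354 % 101 = 0, so "abc" lands in bucket 0. */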
/**
 * Check if a number is prime (helper for hash table resizing)
 */
int
is_prime (int n)
{
  if (n <= 1)
    return 0;
  if (n <= 3)
    return 1;
  if (n % 2 == 0 || n % 3 == 0)
    return 0;
  // Only candidates of the form 6k +/- 1 remain after the checks above
  for (int i = 5; i * i <= n; i += 6)
    {
      if (n % i == 0 || n % (i + 2) == 0)
        return 0;
    }
  return 1;
}
/**
 * Check if a URL is already in the hash table
 * Returns 1 if found, 0 if not found
 */
int
url_exists (url_collection_t *collection, const char *url)
{
  if (!collection || !collection->hash_table || !url)
    {
      return 0;
    }
  unsigned int hash = hash_string (url, collection->hash_size);
  // Find the chain at this hash position
  hash_entry_t *entry = collection->hash_table[hash];
  while (entry != NULL)
    {
      // Compare to existing URLs
      if (strcmp (entry->url, url) == 0)
        {
          return 1; // URL already exists
        }
      // Move to next entry in hash chain
      entry = entry->next;
    }
  return 0; // URL not found
}
/**
 * Resize hash table when load factor exceeds threshold
 * Returns 1 on success, 0 on failure
 */
int
resize_hash_table (url_collection_t *collection)
{
  int old_size = collection->hash_size;
  hash_entry_t **old_table = collection->hash_table;
  // Find next prime number approximately double the current size
  int new_size = old_size * 2 + 1;
  while (!is_prime (new_size))
    {
      new_size += 2;
    }
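  /* Example, starting from the initial 101-bucket table: 2 * 101 + 1 = 203
     (= 7 * 29), and 205, 207 and 209 are also composite, so the table
     grows to the next prime, 211. */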
  // Allocate new table
  hash_entry_t **new_table
      = (hash_entry_t **) calloc (new_size, sizeof (hash_entry_t *));
  if (!new_table)
    {
      fprintf (stderr, "Failed to allocate new hash table\n");
      return 0;
    }
  // Rehash all entries
  for (int i = 0; i < old_size; i++)
    {
      hash_entry_t *entry = old_table[i];
      while (entry != NULL)
        {
          // Save next pointer before we modify the entry
          hash_entry_t *next = entry->next;
          // Insert into new table
          unsigned int hash = hash_string (entry->url, new_size);
          entry->next = new_table[hash];
          new_table[hash] = entry;
          entry = next;
        }
    }
  // Update collection with new table
  collection->hash_table = new_table;
  collection->hash_size = new_size;
  // Free the old table (but not entries - they're reused)
  free (old_table);
  return 1;
}
/**
 * Add URL to hash table for duplicate checking
 * Returns 1 on success, 0 on failure
 */
int
add_to_hash (url_collection_t *collection, const char *url)
{
  // Check if we need to resize the hash table
  double load_factor
      = (double) (collection->hash_count + 1) / collection->hash_size;
  if (load_factor > MAX_LOAD_FACTOR)
    {
      if (!resize_hash_table (collection))
        {
          return 0;
        }
    }
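  /* With the initial 101-bucket table this first triggers on the 76th
     insertion, since 76 / 101 ~ 0.752 exceeds MAX_LOAD_FACTOR. */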
  // Create new entry
  hash_entry_t *entry = (hash_entry_t *) malloc (sizeof (hash_entry_t));
  if (!entry)
    {
      fprintf (stderr, "Failed to allocate hash entry\n");
      return 0;
    }
  // Copy the URL
  entry->url = strdup (url);
  if (!entry->url)
    {
      fprintf (stderr, "Failed to duplicate URL for hash table\n");
      free (entry);
      return 0;
    }
  // Insert at head of chain
  unsigned int hash = hash_string (url, collection->hash_size);
  entry->next = collection->hash_table[hash];
  collection->hash_table[hash] = entry;
  collection->hash_count++;
  return 1;
}
/**
 * Add a URL to the collection
 * The collection will automatically resize if needed
 * Duplicate URLs will not be added (O(1) average-case lookup)
 * Returns 1 on success, 0 on failure
*/
int
add_url (url_collection_t *collection, const char *url)
{
  if (!collection || !url)
    {
      return 0;
    }
  // Check if this URL already exists in our collection
  if (url_exists (collection, url))
    {
      printf ("Skipping duplicate URL: %s\n", url);
      return 1; // Not an error, just a duplicate
    }
  // Check if we need to resize the array
  if (collection->count >= collection->capacity)
    {
      // Double the capacity when resizing (common growth strategy)
      int new_capacity = collection->capacity * 2;
      char **new_urls
          = (char **) realloc (collection->urls, new_capacity * sizeof (char *));
      if (new_urls == NULL)
        {
          fprintf (stderr, "Failed to resize URL collection.\n");
          return 0;
        }
      collection->urls = new_urls;
      collection->capacity = new_capacity;
    }
  // Add the URL to the collection
  collection->urls[collection->count] = strdup (url);
  if (collection->urls[collection->count] == NULL)
    {
      fprintf (stderr, "Failed to duplicate URL string.\n");
      return 0;
    }
  // Add to hash table for O(1) duplicate checking
  if (!add_to_hash (collection, url))
    {
      // Failed to add to hash table; free the copy and clear the slot
      free (collection->urls[collection->count]);
      collection->urls[collection->count] = NULL;
      return 0;
    }
  collection->count++;
  printf ("Added URL to collection: %s\n", url);
  return 1;
}
/**
 * Free the URL collection and hash table
 */
void
free_url_collection (url_collection_t *collection)
{
  if (!collection)
    {
      return;
    }
  // Free URL strings
  if (collection->urls)
    {
      for (int i = 0; i < collection->count; i++)
        {
          free (collection->urls[i]);
        }
      free (collection->urls);
    }
  // Free hash table entries
  if (collection->hash_table)
    {
      for (int i = 0; i < collection->hash_size; i++)
        {
          hash_entry_t *entry = collection->hash_table[i];
          while (entry != NULL)
            {
              hash_entry_t *next = entry->next;
              free (entry->url);
              free (entry);
              entry = next;
            }
        }
      free (collection->hash_table);
    }
  // Finally free the collection struct itself
  free (collection);
}
/**
 * Print all URLs in the collection
 */
void
print_url_collection (url_collection_t *collection)
{
  if (!collection)
    {
      printf ("No collection to print.\n");
      return;
    }
  printf ("\nCollected Image URLs (%d):\n", collection->count);
  printf ("------------------------------------------------\n");
  for (int i = 0; i < collection->count; i++)
    {
      printf ("[%d] %s\n", i + 1, collection->urls[i]);
    }
  printf ("------------------------------------------------\n");
}
/**
 * Process HTML and extract image URLs using XPath
 * Adds all found URLs to the provided collection
 * Returns the number of new URLs added, or -1 on error
*/
int
extract_image_urls (url_collection_t *collection, const char *html,
                    const char *base_url)
{
  if (!collection || !html)
    {
      fprintf (stderr, "Invalid parameters for extract_image_urls\n");
      return -1;
    }
  int initial_count = collection->count;
  // Initialize libxml2
  xmlInitParser ();
  // Parse the HTML string into a DOM document
  htmlDocPtr doc = htmlReadMemory (html, (int) strlen (html), base_url, NULL,
                                   HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
  if (doc == NULL)
    {
      fprintf (stderr, "Failed to parse HTML.\n");
      xmlCleanupParser ();
      return -1;
    }
  // Create XPath evaluation context
  xmlXPathContextPtr xpath_ctx = xmlXPathNewContext (doc);
  if (xpath_ctx == NULL)
    {
      fprintf (stderr, "Failed to create XPath context.\n");
      xmlFreeDoc (doc);
      xmlCleanupParser ();
      return -1;
    }
  // Evaluate XPath expression to get all img src attributes
  xmlXPathObjectPtr xpath_obj
      = xmlXPathEvalExpression ((const xmlChar *) "//img/@src", xpath_ctx);
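  /* "//img/@src" selects the src attribute of every <img> element.  To
     also catch lazy-loaded images the expression could be widened, e.g.
     "//img/@src | //img/@data-src" (assuming the page uses the common
     data-src convention); the processing loop below would not change. */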
  if (xpath_obj == NULL)
    {
      fprintf (stderr, "Failed to evaluate XPath expression.\n");
      xmlXPathFreeContext (xpath_ctx);
      xmlFreeDoc (doc);
      xmlCleanupParser ();
      return -1;
    }
  // Process the results
  xmlNodeSetPtr nodes = xpath_obj->nodesetval;
  if (nodes)
    {
      for (int i = 0; i < nodes->nodeNr; i++)
        {
          xmlNodePtr node = nodes->nodeTab[i];
          if (node->type != XML_ATTRIBUTE_NODE)
            continue;
          xmlChar *src = xmlNodeGetContent (node);
          if (!src)
            continue;
          // Handle different types of URLs if base_url is provided
          if (base_url && !strstr ((char *) src, "://"))
            {
              if (src[0] == '/')
                {
                  // Host-relative URLs need the scheme-and-host part of
                  // base_url (e.g. https://example.com)
                  if (strstr (base_url, "://"))
                    {
                      char domain[1024] = { 0 };
                      const char *proto_end = strstr (base_url, "://") + 3;
                      const char *domain_end = strchr (proto_end, '/');
                      if (domain_end)
                        {
                          // Copy only up to the first slash after the
                          // host, clamped to the buffer size
                          size_t domain_len
                              = (size_t) (domain_end - base_url);
                          if (domain_len >= sizeof (domain))
                            domain_len = sizeof (domain) - 1;
                          memcpy (domain, base_url, domain_len);
                          domain[domain_len] = '\0';
                        }
                      else
                        {
                          // No slash after the host: use the entire
                          // base_url (bounded copy, unlike strcpy)
                          snprintf (domain, sizeof (domain), "%s", base_url);
                        }
                      // Construct the full URL as domain + src
                      char *full_path = (char *) malloc (
                          strlen (domain) + strlen ((char *) src) + 1);
                      if (full_path)
                        {
                          sprintf (full_path, "%s%s", domain, (char *) src);
                          add_url (collection, full_path);
                          free (full_path);
                        }
                    }
                  else
                    {
                      add_url (collection, (char *) src);
                    }
                }
              else
                {
                  // Regular relative URLs (without leading slash)
                  xmlChar *full_url
                      = xmlBuildURI (src, (const xmlChar *) base_url);
                  if (full_url)
                    {
                      add_url (collection, (char *) full_url);
                      xmlFree (full_url);
                    }
                  else
                    {
                      add_url (collection, (char *) src);
                    }
                }
            }
          else
            {
              // Absolute URLs or no base_url provided
              add_url (collection, (char *) src);
            }
          xmlFree (src);
        }
    }
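  /* Note: xmlCleanupParser() tears down libxml2's global state.  Calling
     it after every parse, as this example does, is only appropriate in a
     single-threaded program that is finished with libxml2; in a larger
     application it belongs once at process exit. */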
  // Cleanup
  xmlXPathFreeObject (xpath_obj);
  xmlXPathFreeContext (xpath_ctx);
  xmlFreeDoc (doc);
  xmlCleanupParser ();
  // Return the actual number of new URLs added
  return collection->count - initial_count;
}
// Example usage
int
main (void)
{
  // Create a new URL collection
  url_collection_t *collection = create_url_collection (0);
  if (!collection)
    {
      fprintf (stderr, "Failed to create URL collection.\n");
      return 1;
    }
  // Example HTML with various img tags (two of them repeated to
  // exercise the duplicate check)
  const char *html_example
      = "<html><body>"
        "<h1>Sample Page</h1>"
        "<p>Text with <img src='/images/image3.gif'> embedded image</p>"
        "<img src='image1.jpg' alt='Image 1'>"
        "<div><img src='https://example.com/image2.png' alt='Image 2'></div>"
        "<p>Text with <img src='/images/image3.gif'> embedded image</p>"
        "<img src='image1.jpg' alt='Image 1'>"
        "</body></html>";
  const char *base_url = "https://example.com/page/";
  printf ("Extracting image URLs from sample HTML:\n");
  int found = extract_image_urls (collection, html_example, base_url);
  if (found < 0)
    {
      fprintf (stderr, "Extraction failed.\n");
      free_url_collection (collection);
      return 1;
    }
  printf ("Found %d unique image URLs\n", found);
  // Print collected URLs
  print_url_collection (collection);
  // Free the URL collection
  free_url_collection (collection);
  return 0;
}
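/*
 * Expected result for the sample above (URL order follows document order;
 * the two repeated tags are reported as skipped duplicates):
 *
 *   [1] https://example.com/images/image3.gif
 *   [2] https://example.com/page/image1.jpg
 *   [3] https://example.com/image2.png
 *
 * "Found 3 unique image URLs": image1.jpg resolves against the page URL
 * via xmlBuildURI, while the leading-slash URL is joined to the scheme
 * and host extracted from base_url.
 */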