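/*
 * Extracts image URLs (the src attributes of <img> tags) from an HTML
 * document using libxml2's HTML parser and XPath, resolves relative URLs
 * against a caller-supplied base URL, and stores the results in a
 * dynamically growing collection. A separate-chaining hash table is used
 * to skip duplicate URLs in O(1) expected time.
 */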
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>

// Initial hash table size (prime number for better distribution)
#define INITIAL_HASH_SIZE 101
#define MAX_LOAD_FACTOR 0.75
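/*
 * Building (a typical command line; assumes the libxml2 development headers
 * are installed and the file is saved as extract_image_urls.c):
 *
 *   cc -o extract_image_urls extract_image_urls.c $(xml2-config --cflags --libs)
 *
 * or, with pkg-config:
 *
 *   cc -o extract_image_urls extract_image_urls.c \
 *       $(pkg-config --cflags --libs libxml-2.0)
 */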
int is_prime (int n);

// Structure for hash table entries
typedef struct hash_entry
{
  char *url;                 // URL string
  struct hash_entry *next;   // Next entry in chain
} hash_entry_t;

// Structure to hold URL collection
typedef struct
{
  char **urls;               // Dynamic array of URL strings
  int count;                 // Number of URLs in the collection
  int capacity;              // Current capacity of the array
  hash_entry_t **hash_table; // Hash table for O(1) duplicate checking
  int hash_size;             // Current size of hash table
  int hash_count;            // Number of entries in hash table
} url_collection_t;
/**
 * Initialize a new URL collection
 * Returns a pointer to the collection or NULL on failure
 */
url_collection_t *
create_url_collection (int initial_capacity)
{
  // Use a reasonable default if no capacity provided
  if (initial_capacity <= 0)
    {
      initial_capacity = 8;
    }

  // Allocate the collection struct
  url_collection_t *collection
      = (url_collection_t *) malloc (sizeof (url_collection_t));
  if (!collection)
    {
      fprintf (stderr, "Failed to allocate URL collection.\n");
      return NULL;
    }

  // Initialize to zero/NULL
  memset (collection, 0, sizeof (url_collection_t));

  // Allocate the URL array
  collection->urls = (char **) malloc (initial_capacity * sizeof (char *));
  if (!collection->urls)
    {
      fprintf (stderr, "Failed to allocate memory for URL array.\n");
      free (collection);
      return NULL;
    }
  collection->capacity = initial_capacity;

  // Initialize hash table
  collection->hash_table
      = (hash_entry_t **) calloc (INITIAL_HASH_SIZE, sizeof (hash_entry_t *));
  if (!collection->hash_table)
    {
      fprintf (stderr, "Failed to initialize hash table.\n");
      free (collection->urls);
      free (collection);
      return NULL;
    }
  collection->hash_size = INITIAL_HASH_SIZE;

  return collection;
}
/**
 * Simple polynomial string hash (the classic multiply-by-31 scheme)
 */
unsigned int
hash_string (const char *str, int table_size)
{
  unsigned int hash = 0;
  while (*str)
    {
      // Cast to unsigned char so bytes >= 0x80 hash consistently
      hash = hash * 31 + (unsigned char) (*str++);
    }
  return hash % table_size;
}
/**
 * Check if a number is prime (helper for hash table resizing)
 */
int
is_prime (int n)
{
  if (n <= 1)
    return 0;
  if (n <= 3)
    return 1;
  if (n % 2 == 0 || n % 3 == 0)
    return 0;
  for (int i = 5; i * i <= n; i += 6)
    {
      if (n % i == 0 || n % (i + 2) == 0)
        return 0;
    }
  return 1;
}
/**
 * Check if a URL is already in the hash table
 * Returns 1 if found, 0 if not found
 */
int
url_exists (url_collection_t *collection, const char *url)
{
  if (!collection || !collection->hash_table)
    {
      return 0;
    }

  unsigned int hash = hash_string (url, collection->hash_size);

  // Find linked list at this hash position
  hash_entry_t *entry = collection->hash_table[hash];
  while (entry != NULL)
    {
      // Compare to existing URLs
      if (strcmp (entry->url, url) == 0)
        {
          return 1; // URL already exists
        }
      // Move to next entry in hash chain
      entry = entry->next;
    }
  return 0; // URL not found
}
/**
 * Resize hash table when load factor exceeds threshold
 * Returns 1 on success, 0 on failure
 */
int
resize_hash_table (url_collection_t *collection)
{
  int old_size = collection->hash_size;
  hash_entry_t **old_table = collection->hash_table;

  // Find next prime number approximately double the current size
  int new_size = old_size * 2 + 1;
  while (!is_prime (new_size))
    {
      new_size += 2;
    }

  // Allocate new table
  hash_entry_t **new_table
      = (hash_entry_t **) calloc (new_size, sizeof (hash_entry_t *));
  if (!new_table)
    {
      fprintf (stderr, "Failed to allocate new hash table\n");
      return 0;
    }

  // Rehash all entries
  for (int i = 0; i < old_size; i++)
    {
      hash_entry_t *entry = old_table[i];
      while (entry != NULL)
        {
          // Save next pointer before we modify the entry
          hash_entry_t *next = entry->next;

          // Insert into new table
          unsigned int hash = hash_string (entry->url, new_size);
          entry->next = new_table[hash];
          new_table[hash] = entry;

          entry = next;
        }
    }

  // Update collection with new table
  collection->hash_table = new_table;
  collection->hash_size = new_size;

  // Free the old table (but not entries - they're reused)
  free (old_table);
  return 1;
}
/**
 * Add URL to hash table for duplicate checking
 * Returns 1 on success, 0 on failure
 */
int
add_to_hash (url_collection_t *collection, const char *url)
{
  // Check if we need to resize the hash table
  double load_factor
      = (double) (collection->hash_count + 1) / collection->hash_size;
  if (load_factor > MAX_LOAD_FACTOR)
    {
      if (!resize_hash_table (collection))
        {
          return 0;
        }
    }

  // Create new entry
  hash_entry_t *entry = (hash_entry_t *) malloc (sizeof (hash_entry_t));
  if (!entry)
    {
      fprintf (stderr, "Failed to allocate hash entry\n");
      return 0;
    }

  // Copy the URL
  entry->url = strdup (url);
  if (!entry->url)
    {
      fprintf (stderr, "Failed to duplicate URL for hash table\n");
      free (entry);
      return 0;
    }

  // Insert at head of chain
  unsigned int hash = hash_string (url, collection->hash_size);
  entry->next = collection->hash_table[hash];
  collection->hash_table[hash] = entry;
  collection->hash_count++;
  return 1;
}
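/*
 * Design note: the table uses separate chaining, and whenever an insert
 * would push the load factor above MAX_LOAD_FACTOR it is grown to roughly
 * twice its size (rounded up to the next prime), so duplicate checks and
 * inserts stay O(1) on average, amortized over the occasional rehash.
 */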
/**
 * Add a URL to the collection
 * The collection will automatically resize if needed
 * Duplicate URLs will not be added (O(1) lookup)
 * Returns 1 on success, 0 on failure
 */
int
add_url (url_collection_t *collection, const char *url)
{
  if (!collection || !url)
    {
      return 0;
    }

  // Check if this URL already exists in our collection
  if (url_exists (collection, url))
    {
      printf ("Skipping duplicate URL: %s\n", url);
      return 1; // Not an error, just a duplicate
    }

  // Check if we need to resize the array
  if (collection->count >= collection->capacity)
    {
      // Double the capacity when resizing (common growth strategy)
      int new_capacity = collection->capacity * 2;
      char **new_urls = (char **) realloc (collection->urls,
                                           new_capacity * sizeof (char *));
      if (new_urls == NULL)
        {
          fprintf (stderr, "Failed to resize URL collection.\n");
          return 0;
        }
      collection->urls = new_urls;
      collection->capacity = new_capacity;
    }

  // Add the URL to the collection
  collection->urls[collection->count] = strdup (url);
  if (collection->urls[collection->count] == NULL)
    {
      fprintf (stderr, "Failed to duplicate URL string.\n");
      return 0;
    }

  // Add to hash table for O(1) duplicate checking
  if (!add_to_hash (collection, url))
    {
      // Failed to add to hash table, free the URL string
      free (collection->urls[collection->count]);
      collection->urls[collection->count] = NULL;
      return 0;
    }

  collection->count++;
  printf ("Added URL to collection: %s\n", url);
  return 1;
}
/**
 * Free the URL collection and hash table
 */
void
free_url_collection (url_collection_t *collection)
{
  if (!collection)
    {
      return;
    }

  // Free URL strings
  if (collection->urls)
    {
      for (int i = 0; i < collection->count; i++)
        {
          free (collection->urls[i]);
        }
      free (collection->urls);
    }

  // Free hash table entries
  if (collection->hash_table)
    {
      for (int i = 0; i < collection->hash_size; i++)
        {
          hash_entry_t *entry = collection->hash_table[i];
          while (entry != NULL)
            {
              hash_entry_t *next = entry->next;
              free (entry->url);
              free (entry);
              entry = next;
            }
        }
      free (collection->hash_table);
    }

  // Finally free the collection struct itself
  free (collection);
}
/**
 * Print all URLs in the collection
 */
void
print_url_collection (url_collection_t *collection)
{
  if (!collection)
    {
      printf ("No collection to print.\n");
      return;
    }

  printf ("\nCollected Image URLs (%d):\n", collection->count);
  printf ("------------------------------------------------\n");
  for (int i = 0; i < collection->count; i++)
    {
      printf ("[%d] %s\n", i + 1, collection->urls[i]);
    }
  printf ("------------------------------------------------\n");
}
/**
 * Process HTML and extract image URLs using XPath
 * Adds all found URLs to the provided collection
 * Returns the number of new (unique) URLs added, or -1 on error
 */
int
extract_image_urls (url_collection_t *collection, const char *html,
                    const char *base_url)
{
  if (!collection || !html)
    {
      fprintf (stderr, "Invalid parameters for extract_image_urls\n");
      return -1;
    }

  int initial_count = collection->count;

  // Initialize libxml2
  xmlInitParser ();

  // Parse the HTML string into a DOM document
  htmlDocPtr doc = htmlReadMemory (html, strlen (html), base_url, NULL,
                                   HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
  if (doc == NULL)
    {
      fprintf (stderr, "Failed to parse HTML.\n");
      xmlCleanupParser ();
      return -1;
    }

  // Create XPath evaluation context
  xmlXPathContextPtr xpath_ctx = xmlXPathNewContext (doc);
  if (xpath_ctx == NULL)
    {
      fprintf (stderr, "Failed to create XPath context.\n");
      xmlFreeDoc (doc);
      xmlCleanupParser ();
      return -1;
    }

  // Evaluate XPath expression to get all img src attributes
  xmlXPathObjectPtr xpath_obj
      = xmlXPathEvalExpression ((const xmlChar *) "//img/@src", xpath_ctx);
  if (xpath_obj == NULL)
    {
      fprintf (stderr, "Failed to evaluate XPath expression.\n");
      xmlXPathFreeContext (xpath_ctx);
      xmlFreeDoc (doc);
      xmlCleanupParser ();
      return -1;
    }

  // Process the results
  xmlNodeSetPtr nodes = xpath_obj->nodesetval;
  if (nodes)
    {
      for (int i = 0; i < nodes->nodeNr; i++)
        {
          xmlNodePtr node = nodes->nodeTab[i];
          if (node->type != XML_ATTRIBUTE_NODE)
            continue;

          xmlChar *src = xmlNodeGetContent (node);
          if (!src)
            continue;

          // Handle different types of URLs if base_url is provided
          if (base_url && !strstr ((char *) src, "://"))
            {
              if (src[0] == '/')
                {
                  // URLs starting with / need the domain part of base_url
                  char domain[1024] = { 0 };

                  // Extract domain from base_url (e.g., https://example.com)
                  if (strstr (base_url, "://"))
                    {
                      const char *proto_end = strstr (base_url, "://") + 3;
                      char *domain_end = strchr (proto_end, '/');
                      if (domain_end)
                        {
                          // Copy only up to the first slash after the domain
                          size_t domain_len = (size_t) (domain_end - base_url);
                          if (domain_len >= sizeof (domain))
                            domain_len = sizeof (domain) - 1;
                          memcpy (domain, base_url, domain_len);
                          domain[domain_len] = '\0';
                        }
                      else
                        {
                          // No slash after domain, use entire base_url
                          snprintf (domain, sizeof (domain), "%s", base_url);
                        }

                      // Construct the full URL using domain + src
                      size_t full_len
                          = strlen (domain) + strlen ((char *) src) + 1;
                      char *full_path = (char *) malloc (full_len);
                      if (full_path)
                        {
                          snprintf (full_path, full_len, "%s%s", domain,
                                    (char *) src);
                          add_url (collection, full_path);
                          free (full_path);
                        }
                    }
                  else
                    {
                      add_url (collection, (char *) src);
                    }
                }
              else
                {
                  // Regular relative URLs (without leading slash)
                  xmlChar *full_url
                      = xmlBuildURI (src, (const xmlChar *) base_url);
                  if (full_url)
                    {
                      add_url (collection, (char *) full_url);
                      xmlFree (full_url);
                    }
                  else
                    {
                      add_url (collection, (char *) src);
                    }
                }
            }
          else
            {
              // Absolute URLs or no base_url provided
              add_url (collection, (char *) src);
            }
          xmlFree (src);
        }
    }

  // Cleanup (xmlCleanupParser is intended for process shutdown; calling it
  // per extraction is acceptable in this single-threaded example)
  xmlXPathFreeObject (xpath_obj);
  xmlXPathFreeContext (xpath_ctx);
  xmlFreeDoc (doc);
  xmlCleanupParser ();

  // Return the actual number of new URLs added
  return collection->count - initial_count;
}
// Example usage
int
main (void)
{
  // Create a new URL collection
  url_collection_t *collection = create_url_collection (0);
  if (!collection)
    {
      fprintf (stderr, "Failed to create URL collection.\n");
      return 1;
    }

  // Example HTML with various img tags (the last two are duplicates)
  const char *html_example
      = "<html><body>"
        "<h1>Sample Page</h1>"
        "<p>Text with <img src='/images/image3.gif'> embedded image</p>"
        "<img src='image1.jpg' alt='Image 1'>"
        "<div><img src='https://example.com/image2.png' alt='Image 2'></div>"
        "<p>Text with <img src='/images/image3.gif'> embedded image</p>"
        "<img src='image1.jpg' alt='Image 1'>"
        "</body></html>";
  const char *base_url = "https://example.com/page/";

  printf ("Extracting image URLs from sample HTML:\n");
  int found = extract_image_urls (collection, html_example, base_url);
  printf ("Found %d unique image URLs\n", found);

  // Print collected URLs
  print_url_collection (collection);

  // Free the URL collection
  free_url_collection (collection);
  return 0;
}
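/*
 * Expected output (roughly; exact strings depend on libxml2's URI
 * resolution): three unique URLs are collected from the sample HTML,
 *
 *   https://example.com/images/image3.gif   (root-relative, resolved against the domain)
 *   https://example.com/page/image1.jpg     (relative, resolved with xmlBuildURI)
 *   https://example.com/image2.png          (already absolute)
 *
 * and the two repeated <img> tags are reported as skipped duplicates.
 */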