#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <pthread.h>
#include "html.h"
#include "http.h"
#include "url.h"
#include "list.h"
#include "queue.h"
// because strdup is not standard, but is really useful as long as you're
// careful about freeing the memory it allocates.
char *mystrdup(char *str);
// use this to avoid freeing null pointers (free(NULL) is already a no-op in
// standard C, but this makes the intent explicit).
// the local variable is to avoid multiple evaluation of arguments.
#define xfree(ptr) do { void *p = (ptr); if (p) free(p); } while (0)
#define URL_SIZE 256

/*
 * url_filename:
 * Convert a url into a filename for saving to local disk.
 * e.g. http://www.cosc.canterbury.ac.nz/dept/viscom.shtml -> www.cosc.canterbury.ac.nz/dept|viscom.shtml
 */
void url_filename(char *buffer, int size, char *url) {
    char path[URL_SIZE];
    char host[URL_SIZE];
    get_path(path, URL_SIZE, url);
    get_host(host, URL_SIZE, url);
    if (*path == '\0') {
        strncpy(path, "/", URL_SIZE);
    }
    char *c = path;
    while (*c != '\0') {
        if (*c == '/') {
            *c = '|';
        }
        ++c;
    }
    snprintf(buffer, size, "%s/%s", host, path);
}

// the 'global' state that each thread should have access to
typedef struct context_ {
    Queue *work, *result;
} Context;

typedef struct work_item_ {
    char *host, *path;
    int depth;
} WorkItem;

typedef struct result_item_ {
    char *host, *path;
    // FIXME: for binary (add length)
    char *headers, *content;
    int depth;
} ResultItem;

void put_links(Queue *work_queue, int depth, const char *html,
               unsigned *num_waiting)
{
    char **links = extract_links(html);
    int i;
    /* print_string(links); */
    for (i = 0; links[i] != NULL; i++) {
        char url[URL_SIZE];
        char host[URL_SIZE];
        char path[URL_SIZE];
        add_scheme(url, URL_SIZE, links[i]);
        get_host(host, URL_SIZE, url);
        get_path(path, URL_SIZE, url);
        WorkItem *work_item = malloc(sizeof *work_item);
        work_item->host = mystrdup(host);
        work_item->path = mystrdup(path);
        work_item->depth = depth;
        queue_put(work_queue, work_item);
        (*num_waiting)++;
    }
    free_strings(links);
}

void main_func(Context *ctx, WorkItem *work_item)
{
    // The number of items that we are still waiting for.
    unsigned num_waiting = 0;
    // Add the initial bit of work to the queue
    queue_put(ctx->work, work_item);
    num_waiting++;
    while (num_waiting != 0) {
        char url[URL_SIZE];
        char filename[URL_SIZE];
        FILE *f;
        ResultItem *result;
        result = queue_get(ctx->result);
        num_waiting--;
        make_absolute(url, URL_SIZE, result->host, result->path);
        url_filename(filename, URL_SIZE, url);
        // FIXME: for binary ("b" ??)
        f = fopen(filename, "w");
        if (f) {
            // if the result was an error, the file should be empty
            if (result->headers && result->content) {
                // FIXME: for binary (change to fwrite)
                fprintf(f, "%s", result->content);
            }
            fclose(f);
        }
        // FIXME: only look for links in HTML results
        // add the content type to the result_item_ struct and check it here.
        if (result->depth > 1 && result->headers && result->content)
            put_links(ctx->work, result->depth - 1, result->content, &num_waiting);
        xfree(result->headers);
        xfree(result->content);
        free(result->host);
        free(result->path);
        free(result);
    }
}

void *worker_func(void *ptr)
{
    Context *ctx = ptr;
    WorkItem *work_item;
    // a NULL work item is the shutdown message sent from main()
    while ((work_item = queue_get(ctx->work)) != NULL) {
        ResultItem *result = malloc(sizeof *result);
        result->host = work_item->host;
        result->path = work_item->path;
        result->depth = work_item->depth;
        result->headers = http_query(result->host, result->path, 80);
        result->content = http_split_content(result->headers);
        free(work_item);
        queue_put(ctx->result, result);
    }
    return NULL;
}

int main(int argc, char **argv) {
    Queue *work_queue, *result_queue;
    char url[URL_SIZE], host[URL_SIZE], path[URL_SIZE];
    int i, depth, num_workers;
    pthread_t *threads;
    // dynamically allocate these so they can be free'd inside main_func and
    // worker_func along with others they create.
    Context *ctx = malloc(sizeof *ctx);
    WorkItem *work_item = malloc(sizeof *work_item);
    if (argc != 4) {
        fprintf(stderr, "usage: ./crawler url depth num_workers\n");
        exit(1);
    }
    // add http:// if needed - the uriparser doesn't like absolute urls without it
    add_scheme(url, URL_SIZE, argv[1]);
    // get host - e.g. www.canterbury.ac.nz
    get_host(host, URL_SIZE, url);
    // get the path too - the workers expect a path (as in put_links), not a full url
    get_path(path, URL_SIZE, url);
    if (*path == '\0') {
        strncpy(path, "/", URL_SIZE);
    }
    depth = atoi(argv[2]);
    num_workers = atoi(argv[3]);
    printf("Crawling %s to depth %d, with %d worker threads\n", url, depth, num_workers);
    work_queue = queue_alloc(num_workers);
    result_queue = queue_alloc(num_workers);
    ctx->work = work_queue;
    ctx->result = result_queue;
    work_item->host = mystrdup(host);
    work_item->path = mystrdup(path);
    work_item->depth = depth;
    threads = malloc(num_workers * sizeof *threads);
    for (i = 0; i < num_workers; i++)
        pthread_create(&threads[i], NULL, worker_func, ctx);
    main_func(ctx, work_item);
    // a NULL work item tells each worker to shut down
    for (i = 0; i < num_workers; i++)
        queue_put(work_queue, NULL);
    for (i = 0; i < num_workers; i++)
        pthread_join(threads[i], NULL); // ignore result
    /*
     * the main thread should be taking things off the result queue, getting
     * the links from them, then putting those links on the work queue.
     * the worker threads should be taking things off the work queue, doing
     * the HTTP queries, then putting the HTTP query results on the result
     * queue.
     * How does the main thread know when it is done? It's done when everything
     * that it needs to do is done. It knows that it has nothing to do when it
     * has counted as many result items received as work items sent.
     * Every time it adds a WorkItem to the work queue, it increments its
     * counter. Every time it gets a ResultItem from the result queue, it
     * decrements its counter. It loops as long as the counter is non-zero.
     * How do the worker threads know when they are done? They will receive a
     * NULL message. One of these is sent for each worker when the main thread
     * has a zero counter and no work for the worker threads to do.
     */
    // Example of possible main function:
    // create pages queue
    // List *pages = list_new();
    // add_page(pages, url, depth);
    // create_directory(host);
    // spawn threads and create work queue(s)
    // Context *context = spawn_workers(num_workers);
    //
    // crawl(pages, context);
    //
    // //cleanup
    // free_workers(context);
    // list_free(pages);
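    // A possible sketch of the spawn_workers helper hinted at above. That
    // helper is not part of this gist; the body below just wraps the
    // queue_alloc/pthread_create setup that main already does inline, using
    // the Context type and worker_func defined earlier. To join (and free)
    // the workers later, the pthread_t ids would also need to be kept
    // somewhere, e.g. inside Context.
    //
    // Context *spawn_workers(int num_workers) {
    //     Context *ctx = malloc(sizeof *ctx);
    //     ctx->work = queue_alloc(num_workers);
    //     ctx->result = queue_alloc(num_workers);
    //     for (int i = 0; i < num_workers; i++) {
    //         pthread_t thread;
    //         pthread_create(&thread, NULL, worker_func, ctx);
    //     }
    //     return ctx;
    // }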
    return 0;
}

// because strdup is not standard
char *mystrdup(char *str)
{
    size_t len;
    char *newstr = NULL;
    len = strlen(str);
    newstr = malloc(len + 1);
    strcpy(newstr, str);
    return newstr;
}