#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <pthread.h>

#include "html.h"
#include "http.h"
#include "url.h"
#include "list.h"
#include "queue.h"
// because strdup is not standard C (it's POSIX), but is really useful as long
// as you're careful about freeing the memory it allocates.
char *mystrdup(const char *str);

// use this to avoid freeing null pointers.
// the local variable is to avoid multiple evaluation of arguments.
#define xfree(ptr) do { void *p = (ptr); if (p) free(p); } while (0)

#define URL_SIZE 256
/*
 * url_filename:
 * Convert a url into a filename for saving to local disk.
 * e.g. http://www.cosc.canterbury.ac.nz/dept/viscom.shtml -> www.cosc.canterbury.ac.nz/dept|viscom.shtml
 */
void url_filename(char *buffer, int size, char *url) {
    char path[URL_SIZE];
    char host[URL_SIZE];

    get_path(path, URL_SIZE, url);
    get_host(host, URL_SIZE, url);

    if (*path == '\0') {
        strncpy(path, "/", URL_SIZE);
    }

    // replace the path separators so the whole path becomes a single file name
    char *c = path;
    while (*c != '\0') {
        if (*c == '/') {
            *c = '|';
        }
        ++c;
    }

    snprintf(buffer, size, "%s/%s", host, path);
}
// the 'global' state that each thread should have access to
typedef struct context_ {
    Queue *work, *result;
} Context;

typedef struct work_item_ {
    char *host, *path;
    int depth;
} WorkItem;

typedef struct result_item_ {
    char *host, *path;
    // FIXME: for binary (add length)
    char *headers, *content;
    int depth;
} ResultItem;
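
/*
 * A possible shape for the binary-safe variant mentioned in the FIXME above
 * (a sketch only, not what the code below uses): carrying an explicit byte
 * count would let main_func write the body with fwrite instead of fprintf,
 * so embedded NUL bytes survive. The content_length field name is made up
 * here for illustration.
 *
 *     typedef struct result_item_ {
 *         char *host, *path;
 *         char *headers, *content;
 *         size_t content_length;  // set by the worker from the HTTP response
 *         int depth;
 *     } ResultItem;
 */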
void put_links(Queue *work_queue, int depth, const char *html,
        int *num_waiting)
{
    char **links = extract_links(html);
    int i;
    /* print_string(links); */
    for (i = 0; links[i] != NULL; i++) {
        char url[URL_SIZE];
        char host[URL_SIZE];
        char path[URL_SIZE];

        add_scheme(url, URL_SIZE, links[i]);
        get_host(host, URL_SIZE, url);
        get_path(path, URL_SIZE, url);

        WorkItem *work_item = malloc(sizeof *work_item);
        work_item->host = mystrdup(host);
        work_item->path = mystrdup(path);
        work_item->depth = depth;
        queue_put(work_queue, work_item);
        (*num_waiting)++;
    }
    free_strings(links);
}
void main_func(Context *ctx, WorkItem *work_item)
{
    // The number of items that we are still waiting for.
    unsigned num_waiting = 0;

    // Add the initial bit of work to the queue
    queue_put(ctx->work, work_item);
    num_waiting++;

    while (num_waiting != 0) {
        char url[URL_SIZE];
        char filename[URL_SIZE];
        FILE *f;
        ResultItem *result;

        result = queue_get(ctx->result);
        num_waiting--;

        make_absolute(url, URL_SIZE, result->host, result->path);
        url_filename(filename, URL_SIZE, url);

        // FIXME: for binary ("b" ??)
        f = fopen(filename, "w");
        if (f) {
            // if the result was an error, the file should be empty
            if (result->headers && result->content) {
                // FIXME: for binary (change to fwrite)
                fprintf(f, "%s", result->content);
            }
            fclose(f);
        }

        // FIXME: only look for links in HTML results
        // add the content type to the result_item_ struct and check it here.
        if (result->depth > 1 && result->headers && result->content)
            put_links(ctx->work, result->depth - 1, result->content, &num_waiting);

        xfree(result->headers);
        xfree(result->content);
        free(result->host);
        free(result->path);
        free(result);
    }
}
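
/*
 * A sketch of the binary-safe write that the FIXMEs in main_func point at,
 * assuming the ResultItem variant sketched above (with a content_length
 * field). Illustrative only; the current code writes text with fprintf.
 *
 *     FILE *f = fopen(filename, "wb");  // "b" so the bytes are not translated
 *     if (f) {
 *         if (result->headers && result->content)
 *             fwrite(result->content, 1, result->content_length, f);
 *         fclose(f);
 *     }
 */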
void *worker_func(void *ptr)
{
    Context *ctx = ptr;
    WorkItem *work_item;

    // a NULL work item is the shutdown signal from the main thread
    while ((work_item = queue_get(ctx->work)) != NULL) {
        ResultItem *result = malloc(sizeof *result);
        result->host = work_item->host;
        result->path = work_item->path;
        result->depth = work_item->depth;
        result->headers = http_query(result->host, result->path, 80);
        result->content = http_split_content(result->headers);
        free(work_item);
        queue_put(ctx->result, result);
    }
    return NULL;
}
int main(int argc, char **argv) {
    Queue *work_queue, *result_queue;
    char url[URL_SIZE], host[URL_SIZE], path[URL_SIZE];
    int depth, num_workers, i;
    pthread_t *threads;

    // dynamically allocate these so they can be free'd inside main_func and
    // worker_func along with others they create.
    Context *ctx = malloc(sizeof *ctx);
    WorkItem *work_item = malloc(sizeof *work_item);

    if (argc != 4) {
        fprintf(stderr, "usage: ./crawler url depth num_workers\n");
        exit(1);
    }

    // add http:// if needed - the uriparser doesn't like absolute urls without it
    add_scheme(url, URL_SIZE, argv[1]);
    // get host - e.g. www.canterbury.ac.nz
    get_host(host, URL_SIZE, url);
    get_path(path, URL_SIZE, url);

    depth = atoi(argv[2]);
    num_workers = atoi(argv[3]);
    printf("Crawling %s to depth %d, with %d worker threads\n", url, depth, num_workers);

    // saved files go under a directory named after the host
    mkdir(host, 0755);

    work_queue = queue_alloc(num_workers);
    result_queue = queue_alloc(num_workers);
    ctx->work = work_queue;
    ctx->result = result_queue;

    work_item->host = mystrdup(host);
    work_item->path = mystrdup(path);
    work_item->depth = depth;

    threads = malloc(num_workers * sizeof *threads);
    for (i = 0; i < num_workers; i++)
        pthread_create(&threads[i], NULL, worker_func, ctx);

    main_func(ctx, work_item);

    // one NULL per worker tells worker_func to stop
    for (i = 0; i < num_workers; i++)
        queue_put(work_queue, NULL);
    for (i = 0; i < num_workers; i++)
        pthread_join(threads[i], NULL); // ignore result
    /*
     * the main thread should be taking things off the result queue, getting
     * the links from them, then putting those links on the work queue.
     * the worker threads should be taking things off the work queue, doing
     * the HTTP queries, then putting the HTTP query results on the result
     * queue.
     *
     * How does the main thread know when it is done? It's done when everything
     * that it needs to do is done. It knows that it has nothing to do when it
     * has counted as many result items received as work items sent.
     * Every time it adds a WorkItem to the work queue, it increments its
     * counter. Every time it gets a ResultItem from the result queue, it
     * decrements its counter. It loops as long as the counter is non-zero.
     *
     * How do the worker threads know when they are done? They will receive a
     * NULL message. One of these is sent for each worker when the main thread
     * has a zero counter and no work for the worker threads to do.
     */
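    /*
     * A worked example of that counter (the numbers here are illustrative
     * only), assuming depth 2 and a start page containing two links:
     *   put the start page            -> counter = 1
     *   get its result, save it       -> counter = 0; put its 2 links -> counter = 2
     *   get the first link's result   -> counter = 1 (depth is now 1, so no new links)
     *   get the second link's result  -> counter = 0, and the loop exits
     * At that point main sends one NULL per worker so each worker_func returns.
     */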
    // Example of possible main function:

    // create pages queue
    // List *pages = list_new();
    // add_page(pages, url, depth);
    // create_directory(host);

    // spawn threads and create work queue(s)
    // Context *context = spawn_workers(num_workers);
    //
    // crawl(pages, context);
    //
    // //cleanup
    // free_workers(context);
    // list_free(pages);

    return 0;
}
// because strdup is not standard
char *mystrdup(const char *str)
{
    size_t len;
    char *newstr = NULL;

    len = strlen(str);
    newstr = malloc(len + 1);
    strcpy(newstr, str);
    return newstr;
}