#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <pthread.h>

#include "html.h"
#include "http.h"
#include "url.h"
#include "list.h"
#include "queue.h"
// because strdup is not standard C (it's POSIX), but is really useful as long
// as you're careful about freeing the memory it allocates.
char *mystrdup(const char *str);

// use this to avoid freeing null pointers.
// the local variable is to avoid multiple evaluation of arguments.
#define xfree(ptr) do { void *p = (ptr); if (p) free(p); } while (0)

#define URL_SIZE 256
/*
 * url_filename:
 * Convert a url into a filename for saving to local disk.
 * e.g. http://www.cosc.canterbury.ac.nz/dept/viscom.shtml -> www.cosc.canterbury.ac.nz/dept|viscom.shtml
 */
void url_filename(char *buffer, int size, char *url) {
    char path[URL_SIZE];
    char host[URL_SIZE];

    get_path(path, URL_SIZE, url);
    get_host(host, URL_SIZE, url);

    if (*path == '\0') {
        strncpy(path, "/", URL_SIZE);
    }

    // replace the path separators so the whole path becomes a single file name
    char *c = path;
    while (*c != '\0') {
        if (*c == '/') {
            *c = '|';
        }
        ++c;
    }

    snprintf(buffer, size, "%s/%s", host, path);
}
// the 'global' state that each thread should have access to
typedef struct context_ {
    Queue *work, *result;
} Context;

typedef struct work_item_ {
    char *host, *path;
    int depth;
} WorkItem;

typedef struct result_item_ {
    char *host, *path;
    // FIXME: for binary (add length)
    char *headers, *content;
    int depth;
} ResultItem;
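
/*
 * A possible shape for the binary-safe variant mentioned in the FIXME above
 * (a sketch only, not what the code below uses): carrying an explicit byte
 * count would let main_func write the body with fwrite instead of fprintf,
 * so embedded NUL bytes survive. The content_length field name is made up
 * here for illustration.
 *
 *     typedef struct result_item_ {
 *         char *host, *path;
 *         char *headers, *content;
 *         size_t content_length;  // set by the worker from the HTTP response
 *         int depth;
 *     } ResultItem;
 */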
void put_links(Queue *work_queue, int depth, const char *html,
        int *num_waiting)
{
    char **links = extract_links(html);
    int i;
    /* print_string(links); */
    for (i = 0; links[i] != NULL; i++) {
        char url[URL_SIZE];
        char host[URL_SIZE];
        char path[URL_SIZE];

        add_scheme(url, URL_SIZE, links[i]);
        get_host(host, URL_SIZE, url);
        get_path(path, URL_SIZE, url);

        WorkItem *work_item = malloc(sizeof *work_item);
        work_item->host = mystrdup(host);
        work_item->path = mystrdup(path);
        work_item->depth = depth;
        queue_put(work_queue, work_item);
        (*num_waiting)++;
    }
    free_strings(links);
}
void main_func(Context *ctx, WorkItem *work_item)
{
    // The number of items that we are still waiting for.
    unsigned num_waiting = 0;

    // Add the initial bit of work to the queue
    queue_put(ctx->work, work_item);
    num_waiting++;

    while (num_waiting != 0) {
        char url[URL_SIZE];
        char filename[URL_SIZE];
        FILE *f;
        ResultItem *result;

        result = queue_get(ctx->result);
        num_waiting--;

        make_absolute(url, URL_SIZE, result->host, result->path);
        url_filename(filename, URL_SIZE, url);

        // FIXME: for binary ("b" ??)
        f = fopen(filename, "w");
        if (f) {
            // if the result was an error, the file should be empty
            if (result->headers && result->content) {
                // FIXME: for binary (change to fwrite)
                fprintf(f, "%s", result->content);
            }
            fclose(f);
        }

        // FIXME: only look for links in HTML results
        // add the content type to the result_item_ struct and check it here.
        if (result->depth > 1 && result->headers && result->content)
            put_links(ctx->work, result->depth - 1, result->content, &num_waiting);

        xfree(result->headers);
        xfree(result->content);
        free(result->host);
        free(result->path);
        free(result);
    }
}
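
/*
 * A sketch of the binary-safe write that the FIXMEs in main_func point at,
 * assuming the ResultItem variant sketched above (with a content_length
 * field). Illustrative only; the current code writes text with fprintf.
 *
 *     FILE *f = fopen(filename, "wb");  // "b" so the bytes are not translated
 *     if (f) {
 *         if (result->headers && result->content)
 *             fwrite(result->content, 1, result->content_length, f);
 *         fclose(f);
 *     }
 */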
void *worker_func(void *ptr)
{
    Context *ctx = ptr;
    WorkItem *work_item;

    // a NULL work item is the shutdown signal from the main thread
    while ((work_item = queue_get(ctx->work)) != NULL) {
        ResultItem *result = malloc(sizeof *result);
        result->host = work_item->host;
        result->path = work_item->path;
        result->depth = work_item->depth;
        result->headers = http_query(result->host, result->path, 80);
        result->content = http_split_content(result->headers);
        free(work_item);
        queue_put(ctx->result, result);
    }
    return NULL;
}
int main(int argc, char **argv) {
    Queue *work_queue, *result_queue;
    char url[URL_SIZE], host[URL_SIZE], path[URL_SIZE];
    int depth, num_workers, i;
    pthread_t *threads;

    // dynamically allocate these so they can be free'd inside main_func and
    // worker_func along with others they create.
    Context *ctx = malloc(sizeof *ctx);
    WorkItem *work_item = malloc(sizeof *work_item);

    if (argc != 4) {
        fprintf(stderr, "usage: ./crawler url depth num_workers\n");
        exit(1);
    }

    // add http:// if needed - the uriparser doesn't like absolute urls without it
    add_scheme(url, URL_SIZE, argv[1]);
    // get host - e.g. www.canterbury.ac.nz
    get_host(host, URL_SIZE, url);
    get_path(path, URL_SIZE, url);

    depth = atoi(argv[2]);
    num_workers = atoi(argv[3]);
    printf("Crawling %s to depth %d, with %d worker threads\n", url, depth, num_workers);

    // saved files go under a directory named after the host
    mkdir(host, 0755);

    work_queue = queue_alloc(num_workers);
    result_queue = queue_alloc(num_workers);
    ctx->work = work_queue;
    ctx->result = result_queue;

    work_item->host = mystrdup(host);
    work_item->path = mystrdup(path);
    work_item->depth = depth;

    threads = malloc(num_workers * sizeof *threads);
    for (i = 0; i < num_workers; i++)
        pthread_create(&threads[i], NULL, worker_func, ctx);

    main_func(ctx, work_item);

    // one NULL per worker tells worker_func to stop
    for (i = 0; i < num_workers; i++)
        queue_put(work_queue, NULL);
    for (i = 0; i < num_workers; i++)
        pthread_join(threads[i], NULL); // ignore result
    /*
     * the main thread should be taking things off the result queue, getting
     * the links from them, then putting those links on the work queue.
     * the worker threads should be taking things off the work queue, doing
     * the HTTP queries, then putting the HTTP query results on the result
     * queue.
     *
     * How does the main thread know when it is done? It's done when everything
     * that it needs to do is done. It knows that it has nothing to do when it
     * has counted as many result items received as work items sent.
     * Every time it adds a WorkItem to the work queue, it increments its
     * counter. Every time it gets a ResultItem from the result queue, it
     * decrements its counter. It loops as long as the counter is non-zero.
     *
     * How do the worker threads know when they are done? They will receive a
     * NULL message. One of these is sent for each worker when the main thread
     * has a zero counter and no work for the worker threads to do.
     */
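    /*
     * A worked example of that counter (the numbers here are illustrative
     * only), assuming depth 2 and a start page containing two links:
     *   put the start page            -> counter = 1
     *   get its result, save it       -> counter = 0; put its 2 links -> counter = 2
     *   get the first link's result   -> counter = 1 (depth is now 1, so no new links)
     *   get the second link's result  -> counter = 0, and the loop exits
     * At that point main sends one NULL per worker so each worker_func returns.
     */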
    // Example of possible main function:

    // create pages queue
    // List *pages = list_new();
    // add_page(pages, url, depth);
    // create_directory(host);

    // spawn threads and create work queue(s)
    // Context *context = spawn_workers(num_workers);
    //
    // crawl(pages, context);
    //
    // //cleanup
    // free_workers(context);
    // list_free(pages);

    return 0;
}
// because strdup is not standard
char *mystrdup(const char *str)
{
    size_t len;
    char *newstr = NULL;

    len = strlen(str);
    newstr = malloc(len + 1);
    strcpy(newstr, str);
    return newstr;
}