Created September 2, 2020 21:41
A hastily thrown together C program to scrape URLs out of files in a directory. I don't write C, so feel free to laugh (or cry) at all the things I did wrong.
#define _XOPEN_SOURCE 500 /* nftw() needs this on glibc */

#include <stdio.h>
#include <ftw.h>

#define BUFFER_SIZE (1 * 1024 * 1024)

/* Scan a file with a tiny state machine that matches "http://" or
   "https://", then copies bytes until one that can't be part of a
   URL, printing one URL per line. */
void filterfile(const char *filepath) {
    unsigned char buffer[BUFFER_SIZE];
    FILE *file;
    unsigned char ch;
    int sec = 0;    /* saw the 's' of "https" (was uninitialized) */
    int index = -1; /* progress through "http" + "://" */
    size_t x;
    size_t readChars;
    /* Try opening the file */
    file = fopen(filepath, "rb");
    if (file) {
        do {
            /* read a chunk from the file */
            readChars = fread(buffer, sizeof(unsigned char), BUFFER_SIZE, file);
            /* for each byte */
            for (x = 0; x < readChars; ++x) {
                ch = buffer[x];
                if (index == -1 && ch == 'h') {
                    index += 1;
                }
                else if (index == 0 && ch == 't') {
                    index += 1;
                }
                else if (index == 1 && ch == 't') {
                    index += 1;
                }
                else if (index == 2 && ch == 'p') {
                    index += 1;
                }
                else if (index == 3 && ch == 's') {
                    sec = 1;
                }
                else if (index == 3 && ch == ':') {
                    index += 1;
                }
                else if (index == 4 && ch == '/') {
                    index += 1;
                }
                else if (index == 5 && ch == '/') {
                    /* the whole scheme matched: echo it back out */
                    fputs("http", stdout);
                    if (sec == 1) { /* was `sec = 1`, which marked every URL as https */
                        putchar('s');
                        sec = 0;
                    }
                    fputs("://", stdout);
                    index += 1;
                }
                else if (index == 6) {
                    /* printable ASCII except space, '!', '"', '\'', '<', '>'
                       counts as part of the URL */
                    if ((ch > 34 && ch < 39) || (ch > 39 && ch < 60) || ch == 61 || (ch > 62 && ch < 127)) {
                        putchar(ch);
                    }
                    else {
                        /* end of URL: finish the line and start over */
                        putchar('\n');
                        index = -1;
                        sec = 0;
                    }
                }
                else {
                    /* partial match failed: start over */
                    index = -1;
                    sec = 0;
                }
            }
        } while (readChars == BUFFER_SIZE);
        /* if the file ended mid-URL, finish the line so the next
           file's output doesn't get glued onto it */
        if (index == 6) {
            putchar('\n');
        }
        /* close the file */
        fclose(file);
    }
    else {
        perror(filepath); /* was a TODO: log open failures to stderr */
    }
}

int print_entry(const char *filepath, const struct stat *info,
                const int typeflag, struct FTW *pathinfo)
{
    (void)info;
    (void)pathinfo;
    if (typeflag == FTW_F) {
        filterfile(filepath);
    }
    return 0;
}

int main(int argc, char *argv[]) {
    if (argc == 2) {
        /* walk the tree with up to 15 open fds, without following symlinks */
        return nftw(argv[1], print_entry, 15, FTW_PHYS);
    }
    else {
        printf("Scrapes URLs from files in a directory.\n");
        printf("Usage: %s <directory>\n", argv[0]);
        return 0;
    }
}
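To try it out, a minimal build-and-run sketch (urlscrape.c is just my name for the file, and the compiler flags are my own choice, not from the gist):

# build the program
cc -O2 -o urlscrape urlscrape.c
# print every http/https URL found under a directory, one per line
./urlscrape /path/to/some/directory > urls.txt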
For reference: the goal was to be faster than grep. At a 5x speed improvement, I think this more than qualifies.
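For anyone who wants to reproduce the comparison: the original grep invocation isn't given, but something like the following should be a rough equivalent (the pattern is my approximation of the byte set the state machine accepts, not a command from the original benchmark):

# scan with the program, then with grep over the same tree
time ./urlscrape /path/to/dir > /dev/null
# -r recurse, -h no filenames, -a treat binary as text, -o print only matches
time grep -rhaoE 'https?://[^[:space:]"<>]*' /path/to/dir > /dev/null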