Created September 2, 2020 21:41
A hastily thrown together C program to scrape URLs out of files in a directory. I don't write C, so feel free to laugh (or cry) at all the things I did wrong.
#define _XOPEN_SOURCE 500 /* nftw() needs this on glibc */

#include <stdio.h>
#include <ftw.h>

#define BUFFER_SIZE (1 * 1024 * 1024)

/* Scan a file with a tiny state machine that matches "http://" or
   "https://", then copies bytes until one that can't be part of a
   URL, printing one URL per line. */
void filterfile(const char *filepath) {
    unsigned char buffer[BUFFER_SIZE];
    FILE *file;
    unsigned char ch;
    int sec = 0;    /* saw the 's' of "https" (was uninitialized) */
    int index = -1; /* progress through "http" + "://" */
    size_t x;
    size_t readChars;
    /* Try opening the file */
    file = fopen(filepath, "rb");
    if (file) {
        do {
            /* read a chunk from the file */
            readChars = fread(buffer, sizeof(unsigned char), BUFFER_SIZE, file);
            /* for each byte */
            for (x = 0; x < readChars; ++x) {
                ch = buffer[x];
                if (index == -1 && ch == 'h') {
                    index += 1;
                }
                else if (index == 0 && ch == 't') {
                    index += 1;
                }
                else if (index == 1 && ch == 't') {
                    index += 1;
                }
                else if (index == 2 && ch == 'p') {
                    index += 1;
                }
                else if (index == 3 && ch == 's') {
                    sec = 1;
                }
                else if (index == 3 && ch == ':') {
                    index += 1;
                }
                else if (index == 4 && ch == '/') {
                    index += 1;
                }
                else if (index == 5 && ch == '/') {
                    /* the whole scheme matched: echo it back out */
                    fputs("http", stdout);
                    if (sec == 1) { /* was `sec = 1`, which marked every URL as https */
                        putchar('s');
                        sec = 0;
                    }
                    fputs("://", stdout);
                    index += 1;
                }
                else if (index == 6) {
                    /* printable ASCII except space, '!', '"', '\'', '<', '>'
                       counts as part of the URL */
                    if ((ch > 34 && ch < 39) || (ch > 39 && ch < 60) || ch == 61 || (ch > 62 && ch < 127)) {
                        putchar(ch);
                    }
                    else {
                        /* end of URL: finish the line and start over */
                        putchar('\n');
                        index = -1;
                        sec = 0;
                    }
                }
                else {
                    /* partial match failed: start over */
                    index = -1;
                    sec = 0;
                }
            }
        } while (readChars == BUFFER_SIZE);
        /* if the file ended mid-URL, finish the line so the next
           file's output doesn't get glued onto it */
        if (index == 6) {
            putchar('\n');
        }
        /* close the file */
        fclose(file);
    }
    else {
        perror(filepath); /* was a TODO: log open failures to stderr */
    }
}

int print_entry(const char *filepath, const struct stat *info,
                const int typeflag, struct FTW *pathinfo)
{
    (void)info;
    (void)pathinfo;
    if (typeflag == FTW_F) {
        filterfile(filepath);
    }
    return 0;
}

int main(int argc, char *argv[]) {
    if (argc == 2) {
        /* walk the tree with up to 15 open fds, without following symlinks */
        return nftw(argv[1], print_entry, 15, FTW_PHYS);
    }
    else {
        printf("Scrapes URLs from files in a directory.\n");
        printf("Usage: %s <directory>\n", argv[0]);
        return 0;
    }
}
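To try it out, a minimal build-and-run sketch (urlscrape.c is just my name for the file, and the compiler flags are my own choice, not from the gist):

# build the program
cc -O2 -o urlscrape urlscrape.c
# print every http/https URL found under a directory, one per line
./urlscrape /path/to/some/directory > urls.txt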
For reference: the goal was to be faster than grep. At a 5x speed improvement, I think this more than qualifies.
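For anyone who wants to reproduce the comparison: the original grep invocation isn't given, but something like the following should be a rough equivalent (the pattern is my approximation of the byte set the state machine accepts, not a command from the original benchmark):

# scan with the program, then with grep over the same tree
time ./urlscrape /path/to/dir > /dev/null
# -r recurse, -h no filenames, -a treat binary as text, -o print only matches
time grep -rhaoE 'https?://[^[:space:]"<>]*' /path/to/dir > /dev/null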