Created
March 25, 2016 22:32
-
-
Save skylander86/46abf91a244fb1b9ff43 to your computer and use it in GitHub Desktop.
Efficient C code for extracting the English names of organization instances from a Freebase GZip dump
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _POSIX_C_SOURCE 200809L /* must precede all includes: exposes getline() and ssize_t */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#define FALSE 0
#define TRUE 1
#define BUFSIZE 1024

/*
 * Index of the character inspected to pre-filter lines. 28 is the length of
 * the "<http://rdf.freebase.com/ns/" prefix used by the IRIs matched below.
 * NOTE(review): presumably this selects subjects whose id starts with 'm'
 * (Freebase machine ids, "m.xxxx"); the original comment called these
 * "mentions" -- confirm against the dump format.
 */
#define SUBJ_ID_OFFSET 28

/*
 * Print `name` on its own line on stdout, but only when the subject was typed
 * as an organization, an English name was recorded for it, and that name
 * contains at least one space (i.e. it is made of at least two words).
 */
static void emit_subject(int is_org, int have_name, const char *name)
{
    if (is_org && have_name && strchr(name, ' ') != NULL) {
        fprintf(stdout, "%s\n", name);
    }
}

/*
 * Read tab-separated triples (subject, predicate, object, ...) from stdin.
 * Lines that share the same subject are expected to be adjacent. For every
 * subject that has an
 *   <http://rdf.freebase.com/ns/type.object.type> of
 *   <http://rdf.freebase.com/ns/organization.organization>
 * and an "@en" <http://rdf.freebase.com/ns/type.object.name>, the name is
 * written to stdout when it has at least two words. Progress is reported on
 * stderr every million lines.
 *
 * Malformed lines (too short, fewer than three fields) and subjects that do
 * not fit in BUFSIZE are skipped instead of being copied blindly.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if reading stdin or writing stdout
 * failed.
 */
int main(void)
{
    setvbuf(stdin, NULL, _IOFBF, 0);  /* better performance if we buffer stdin */
    setvbuf(stdout, NULL, _IOFBF, 0); /* better performance if we buffer stdout */

    size_t bufsize = 0;
    char *linebuf = NULL;
    char cur_subj[BUFSIZE] = {0};
    char subj_name[BUFSIZE] = {0};
    int subj_is_org = FALSE;
    int subj_have_name = FALSE;
    ssize_t nread;

    for (size_t line_processed = 1;
         (nread = getline(&linebuf, &bufsize, stdin)) > 0;
         ++line_processed) {
        /* there are 3,130,753,066 lines in freebase */
        if (line_processed % 1000000 == 0) {
            fprintf(stderr, "%zu million lines processed.\n", line_processed / 1000000);
        }

        /* Check the length first so short lines never index past what was read. */
        if (nread <= SUBJ_ID_OFFSET || linebuf[SUBJ_ID_OFFSET] != 'm') {
            continue;
        }

        char *subj = strtok(linebuf, "\t");
        if (subj == NULL) {
            continue;
        }

        if (strcmp(cur_subj, subj) != 0) {
            /* Subject changed: report the previous one and start afresh. */
            emit_subject(subj_is_org, subj_have_name, subj_name);
            subj_is_org = FALSE;
            subj_have_name = FALSE;
            subj_name[0] = '\0';

            size_t subj_len = strlen(subj);
            if (subj_len >= sizeof cur_subj) {
                /* Cannot track a subject this long; drop its lines safely. */
                cur_subj[0] = '\0';
                continue;
            }
            memcpy(cur_subj, subj, subj_len + 1);
        }

        /* Tokens point into linebuf and stay valid until the next getline(). */
        const char *pred = strtok(NULL, "\t");
        const char *obj = strtok(NULL, "\t");
        if (pred == NULL || obj == NULL) {
            continue; /* fewer than three fields */
        }

        if (strcmp(pred, "<http://rdf.freebase.com/ns/type.object.type>") == 0) {
            if (strcmp(obj, "<http://rdf.freebase.com/ns/organization.organization>") == 0) {
                subj_is_org = TRUE;
            }
        } else if (strcmp(pred, "<http://rdf.freebase.com/ns/type.object.name>") == 0) {
            size_t len = strlen(obj);
            if (len > 5 && len < BUFSIZE && strcmp(&obj[len - 3], "@en") == 0) {
                /*
                 * Drop the first character and the last four, i.e. the
                 * surrounding quotes and the language tag of "name"@en.
                 */
                size_t name_len = len - 5;
                memcpy(subj_name, &obj[1], name_len);
                subj_name[name_len] = '\0';
                subj_have_name = TRUE;
            }
        }
    }

    /* The last subject has no following "subject changed" line; flush it here. */
    emit_subject(subj_is_org, subj_have_name, subj_name);

    free(linebuf);

    if (ferror(stdin)) {
        fprintf(stderr, "error: failed while reading stdin\n");
        return EXIT_FAILURE;
    }
    if (fflush(stdout) != 0 || ferror(stdout)) {
        fprintf(stderr, "error: failed while writing stdout\n");
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment