Last active
August 29, 2015 14:08
-
-
Save naoa/8d862028e23e45e23304 to your computer and use it in GitHub Desktop.
gcc src/index_sample.c -o index_sample -Wall -O2 -lgroonga -I/usr/include/groonga
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <groonga.h> | |
#include <groonga/nfkc.h> | |
/*
  Benchmark: Japanese Wikipedia, 300,000 records, 3.8G
  real 1m12.745s
  user 0m58.432s
  sys 0m1.263s
*/
int | |
main(int argc, char **argv) | |
{ | |
grn_ctx ctx; | |
const char *database_path = argv[1]; | |
const char *table_name = argv[2]; | |
const char *index_column_name = argv[3]; | |
grn_obj *db; | |
grn_obj *table; | |
grn_obj *index_column; | |
grn_obj *hash; | |
grn_obj *sorted; | |
grn_init(); | |
grn_ctx_init(&ctx, 0); | |
db = grn_db_open(&ctx, database_path); | |
table = grn_ctx_get(&ctx, | |
table_name, | |
strlen(table_name)); | |
index_column = grn_obj_column(&ctx, | |
table, | |
index_column_name, | |
strlen(index_column_name)); | |
hash = grn_table_create(&ctx, NULL, 0, | |
NULL, | |
GRN_OBJ_TABLE_HASH_KEY, | |
grn_ctx_at(&ctx, GRN_DB_SHORT_TEXT), | |
grn_ctx_at(&ctx, GRN_DB_UINT32)); | |
{ | |
grn_table_cursor *cur; | |
if ((cur = grn_table_cursor_open(&ctx, table, NULL, 0, NULL, 0, 0, -1, | |
GRN_CURSOR_BY_ID))) { | |
grn_id id; | |
while ((id = grn_table_cursor_next(&ctx, cur)) != GRN_ID_NIL) { | |
grn_obj *index_cursor; | |
if ((index_cursor = grn_index_cursor_open(&ctx, cur, index_column, | |
0, -1, GRN_CURSOR_BY_ID))) { | |
grn_posting *posting; | |
grn_id term_id = GRN_ID_NIL; | |
char term[GRN_TABLE_MAX_KEY_SIZE]; | |
int term_length = 0; | |
grn_obj value; | |
GRN_UINT32_INIT(&value, 0); | |
while ((posting = grn_index_cursor_next(&ctx, index_cursor, &term_id))) { | |
term_length = grn_table_get_key(&ctx, | |
table, | |
term_id, | |
term, | |
GRN_TABLE_MAX_KEY_SIZE); | |
if (term_length >= 6) { | |
grn_char_type char_type; | |
char_type = grn_nfkc_char_type((unsigned char *)term); | |
if (char_type == GRN_CHAR_HIRAGANA || char_type == GRN_CHAR_KATAKANA || | |
char_type == GRN_CHAR_KANJI) { | |
grn_id hash_id; | |
hash_id = grn_table_add(&ctx, hash, term, term_length, NULL); | |
if (hash_id) { | |
GRN_BULK_REWIND(&value); | |
grn_obj_get_value(&ctx, hash, hash_id, &value); | |
GRN_UINT32_SET(&ctx, &value, GRN_UINT32_VALUE(&value) + posting->tf); | |
grn_obj_set_value(&ctx, hash, hash_id, &value, GRN_OBJ_SET); | |
} | |
} | |
} | |
} | |
grn_obj_unlink(&ctx, &value); | |
} | |
grn_obj_unlink(&ctx, index_cursor); | |
} | |
} | |
grn_table_cursor_close(&ctx, cur); | |
} | |
{ | |
unsigned int nkeys; | |
grn_table_sort_key *keys; | |
const char *sortby_val = "-_value"; | |
unsigned int sortby_len = strlen("-_value"); | |
int offset = 0; | |
int limit = -1; | |
sorted = grn_table_create(&ctx, NULL, 0, NULL, | |
GRN_OBJ_TABLE_NO_KEY, NULL, hash); | |
keys = grn_table_sort_key_from_str(&ctx, sortby_val, sortby_len, hash, &nkeys); | |
if (keys) { | |
grn_table_sort(&ctx, hash, offset, limit, sorted, keys, nkeys); | |
grn_table_sort_key_close(&ctx, keys, nkeys); | |
} | |
} | |
{ | |
grn_table_cursor *cur; | |
if ((cur = grn_table_cursor_open(&ctx, sorted, NULL, 0, NULL, | |
0, 0, -1, GRN_CURSOR_BY_ID))) { | |
grn_id hash_id; | |
grn_obj value; | |
GRN_UINT32_INIT(&value, 0); | |
while ((hash_id = grn_table_cursor_next(&ctx, cur)) != GRN_ID_NIL) { | |
unsigned int sorted_key; | |
grn_table_get_key(&ctx, sorted, hash_id, &sorted_key, sizeof(unsigned int)); | |
{ | |
char key[GRN_TABLE_MAX_KEY_SIZE]; | |
int key_size; | |
key_size = grn_table_get_key(&ctx, hash, sorted_key, &key, GRN_TABLE_MAX_KEY_SIZE); | |
GRN_BULK_REWIND(&value); | |
grn_obj_get_value(&ctx, hash, sorted_key, &value); | |
printf("%.*s,%d\n", key_size, key, GRN_UINT32_VALUE(&value)); | |
} | |
} | |
grn_obj_unlink(&ctx, &value); | |
} | |
grn_table_cursor_close(&ctx, cur); | |
} | |
grn_obj_unlink(&ctx, hash); | |
grn_obj_unlink(&ctx, index_column); | |
grn_obj_unlink(&ctx, table); | |
grn_obj_unlink(&ctx, sorted); | |
grn_ctx_fin(&ctx); | |
grn_fin(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment