Created
December 3, 2010 19:21
-
-
Save kzk/727402 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Description
 *   Using Groonga as an inverted-index library.
 *
 * TODO
 *   - Do not store the document contents (is that possible?).
 *   - Store documents with scores, and order them by score in the index.
 *   - Store documents with sections.
 *   - Store documents with positions.
 *   - Separate the writer and reader functions.
 *
 * Output
 *   CreateIndex
 *   add
 *     add: id=1, str=新垣結衣
 *     add: id=2, str=佐々木希
 *     add: id=3, str=戸田恵梨香
 *     add: id=4, str=安めぐみ
 *     add: id=5, str=北川景子
 *     add: id=6, str=深田恭子
 *   search:
 *     query: body:@子
 *     nhits: 2
 *     select: id=5
 *     select: id=6
 */
#include <groonga/groonga.h> | |
#include <cassert> | |
#include <iostream> | |
#include <string> | |
#include <vector> | |
using namespace std; | |
class GrnInitializer { | |
public: | |
GrnInitializer() { grn_init(); } | |
~GrnInitializer() { grn_fin(); } | |
}; | |
GrnInitializer grn_initializer; | |
/**
 * Build a Groonga expression from a query-syntax string.
 *
 * @param context    Groonga context used for every call made here.
 * @param docs_table Table the expression's record variable is bound to.
 * @param str        Query text. It is parsed with GRN_EXPR_SYNTAX_QUERY,
 *                   GRN_EXPR_ALLOW_PRAGMA and GRN_EXPR_ALLOW_COLUMN, with
 *                   GRN_OP_MATCH as the default mode and GRN_OP_AND as the
 *                   default operator.
 * @return The parsed expression, owned by the caller (release it with
 *         grn_obj_close()), or NULL when the expression could not be
 *         created or the query could not be parsed. Failures are reported
 *         on stderr.
 */
grn_obj*
query(grn_ctx *context, grn_obj *docs_table, const string& str)
{
  // Pre-initialize so a failed creation step is detectable below.
  grn_obj *expression = NULL;
  grn_obj *variable = NULL;
  GRN_EXPR_CREATE_FOR_QUERY(context, docs_table, expression, variable);
  if (!expression || !variable) {
    cerr << "query: failed to create expression" << endl;
    if (expression) {
      grn_obj_close(context, expression);
    }
    return NULL;
  }

  const grn_rc rc = grn_expr_parse(context, expression,
                                   str.c_str(),
                                   static_cast<unsigned int>(str.size()),
                                   NULL, GRN_OP_MATCH, GRN_OP_AND,
                                   GRN_EXPR_SYNTAX_QUERY |
                                   GRN_EXPR_ALLOW_PRAGMA |
                                   GRN_EXPR_ALLOW_COLUMN);
  if (rc != GRN_SUCCESS) {
    // Do not hand a half-built expression back to the caller.
    cerr << "query: failed to parse \"" << str << "\": "
         << context->errbuf << endl;
    grn_obj_close(context, expression);
    return NULL;
  }
  return expression;
}
int | |
CreateIndex(const string& path) | |
{ | |
cerr << "CreateIndex" << endl; | |
grn_obj intbuf; | |
GRN_UINT32_INIT(&intbuf, 0); | |
grn_obj textbuf; | |
GRN_TEXT_INIT(&textbuf, 0); | |
grn_ctx *ctx = new grn_ctx(); | |
grn_ctx_init(ctx, GRN_CTX_USE_QL); | |
GRN_CTX_SET_ENCODING(ctx, GRN_ENC_UTF8); | |
// create database | |
grn_obj *db = grn_db_create(ctx, path.c_str(), NULL); | |
assert(db); | |
grn_ctx_use(ctx, db); | |
// create docs table | |
grn_obj *docs_table = grn_table_create(ctx, "docs", 4, NULL, | |
GRN_OBJ_TABLE_NO_KEY|GRN_OBJ_PERSISTENT, NULL, NULL); | |
assert(docs_table); | |
grn_obj * body_column = grn_column_create(ctx, docs_table, "body", 4, NULL, | |
GRN_OBJ_COLUMN_SCALAR|GRN_OBJ_PERSISTENT, | |
grn_ctx_at(ctx, GRN_DB_TEXT)); | |
assert(body_column); | |
// create lexicon table | |
string table_name = "lexicon"; | |
string table_path = path + ".lexicon"; | |
grn_obj *lexicon_table = grn_table_create(ctx, table_name.c_str(), table_name.size(), | |
table_path.c_str(), | |
GRN_OBJ_PERSISTENT | GRN_OBJ_TABLE_PAT_KEY, | |
grn_ctx_at(ctx, GRN_DB_SHORT_TEXT), NULL); | |
assert(lexicon_table); | |
grn_obj_set_info(ctx, lexicon_table, GRN_INFO_DEFAULT_TOKENIZER, | |
grn_ctx_at(ctx, GRN_DB_BIGRAM)); | |
// create inverted index column on lexicon table | |
string column_name = "index"; | |
string column_path = path + ".ii"; | |
grn_obj *idx_column = grn_column_create(ctx, lexicon_table, | |
column_name.c_str(), column_name.size(), | |
NULL, | |
GRN_OBJ_COLUMN_INDEX | GRN_OBJ_PERSISTENT | GRN_OBJ_WITH_POSITION, | |
docs_table); | |
assert(idx_column); | |
// link body column and inverted index | |
GRN_UINT32_SET(ctx, &intbuf, grn_obj_id(ctx, body_column)); | |
grn_obj_set_info(ctx, idx_column, GRN_INFO_SOURCE, &intbuf); /* need to use grn_id */ | |
// update | |
vector<string> vs; | |
vs.push_back("新垣結衣"); | |
vs.push_back("佐々木希"); | |
vs.push_back("戸田恵梨香"); | |
vs.push_back("安めぐみ"); | |
vs.push_back("北川景子"); | |
vs.push_back("深田恭子"); | |
cerr << "add" << endl; | |
for (unsigned int i = 0; i < vs.size(); i++){ | |
const string& txt = vs[i]; | |
grn_id docid = grn_table_add(ctx, docs_table, NULL, 0, NULL); | |
GRN_TEXT_SETS(ctx, &textbuf, txt.c_str()); | |
grn_obj_set_value(ctx, body_column, docid, &textbuf, GRN_OBJ_SET); | |
cerr << "\tadd: id=" << docid << ", str=" << txt << endl; | |
} | |
// select by expr | |
string q = "body:@子"; | |
grn_obj *res = grn_table_select(ctx, docs_table, query(ctx, docs_table, ("body:@" + q).c_str()), NULL, GRN_OP_OR); | |
assert(res); | |
cerr << "search:" << endl; | |
cerr << "\tquery: " << q << endl; | |
cerr << "\tnhits: " << grn_table_size(ctx, res) << endl; | |
// iterate! | |
grn_id id; | |
int from = 0; | |
int count = 10; | |
grn_table_cursor *cursor = grn_table_cursor_open(ctx, res, NULL, 0, NULL, 0, from, count, 0); | |
while ((id = grn_table_cursor_next(ctx, cursor)) != GRN_ID_NIL) { | |
void *key; | |
int size; | |
size = grn_table_cursor_get_key(ctx, cursor, &key); | |
assert(size == 4); | |
grn_id docid = *((int*)key); | |
cerr << "\tselect: id=" << docid << endl; | |
} | |
grn_table_cursor_close(ctx, cursor); | |
// TODO: too many leaks. fix for production code. | |
} | |
int main() | |
{ | |
system("rm -fR kzk*"); | |
CreateIndex("kzk"); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment