Skip to content

Instantly share code, notes, and snippets.

@kzk
Created December 3, 2010 19:21
Show Gist options
  • Save kzk/727402 to your computer and use it in GitHub Desktop.
Save kzk/727402 to your computer and use it in GitHub Desktop.
/*
* Description
Using Groonga as an inverted-index library.
* TODO
- Do not store document contents (is that possible?).
- Store documents with scores, and order by score in the index.
- Store documents with sections.
- Store documents with positions.
- Separate the writer and reader functions.
* Output
CreateIndex
add
add: id=1, str=新垣結衣
add: id=2, str=佐々木希
add: id=3, str=戸田恵梨香
add: id=4, str=安めぐみ
add: id=5, str=北川景子
add: id=6, str=深田恭子
search:
query: body:@子
nhits: 2
select: id=5
select: id=6
*/
#include <groonga/groonga.h>
#include <cassert>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
// Ties the Groonga library lifetime to the lifetime of an object:
// the constructor calls grn_init() and the destructor calls grn_fin().
class GrnInitializer {
public:
  GrnInitializer()
  {
    grn_init();
  }

  ~GrnInitializer()
  {
    grn_fin();
  }
};

// File-scope instance: constructed before main() starts and destroyed after
// main() returns, so grn_init()/grn_fin() bracket the whole program run.
GrnInitializer grn_initializer;
/**
 * Build a Groonga expression object from a query string.
 *
 * The expression is created for `docs_table` with
 * GRN_EXPR_CREATE_FOR_QUERY and `str` is parsed into it with
 * GRN_OP_MATCH as the default mode, GRN_OP_AND as the default operator and
 * the flags GRN_EXPR_SYNTAX_QUERY | GRN_EXPR_ALLOW_PRAGMA |
 * GRN_EXPR_ALLOW_COLUMN.
 *
 * @param context     Groonga context used for every call.
 * @param docs_table  Table the expression's record variable refers to.
 * @param str         Query text to parse.
 * @return The expression object, or NULL when the expression could not be
 *         created. If parsing fails, the failure is reported on stderr and
 *         the expression is still returned (as it always was), so existing
 *         callers keep working; they can inspect `context->rc` for details.
 */
grn_obj*
query(grn_ctx *context, grn_obj *docs_table, const string& str)
{
  grn_obj *expression = NULL;
  grn_obj *variable = NULL;
  GRN_EXPR_CREATE_FOR_QUERY(context, docs_table, expression, variable);
  if (!expression) {
    // Nothing to parse into; do not hand a NULL expression to grn_expr_parse.
    cerr << "query: failed to create expression: " << context->errbuf << endl;
    return NULL;
  }

  const grn_rc rc = grn_expr_parse(context, expression,
                                   str.c_str(), str.size(),
                                   NULL, GRN_OP_MATCH, GRN_OP_AND,
                                   GRN_EXPR_SYNTAX_QUERY |
                                   GRN_EXPR_ALLOW_PRAGMA |
                                   GRN_EXPR_ALLOW_COLUMN);
  if (rc != GRN_SUCCESS) {
    // Surface the failure instead of silently returning a half-built expression.
    cerr << "query: failed to parse \"" << str << "\": "
         << context->errbuf << endl;
  }
  return expression;
}
int
CreateIndex(const string& path)
{
cerr << "CreateIndex" << endl;
grn_obj intbuf;
GRN_UINT32_INIT(&intbuf, 0);
grn_obj textbuf;
GRN_TEXT_INIT(&textbuf, 0);
grn_ctx *ctx = new grn_ctx();
grn_ctx_init(ctx, GRN_CTX_USE_QL);
GRN_CTX_SET_ENCODING(ctx, GRN_ENC_UTF8);
// create database
grn_obj *db = grn_db_create(ctx, path.c_str(), NULL);
assert(db);
grn_ctx_use(ctx, db);
// create docs table
grn_obj *docs_table = grn_table_create(ctx, "docs", 4, NULL,
GRN_OBJ_TABLE_NO_KEY|GRN_OBJ_PERSISTENT, NULL, NULL);
assert(docs_table);
grn_obj * body_column = grn_column_create(ctx, docs_table, "body", 4, NULL,
GRN_OBJ_COLUMN_SCALAR|GRN_OBJ_PERSISTENT,
grn_ctx_at(ctx, GRN_DB_TEXT));
assert(body_column);
// create lexicon table
string table_name = "lexicon";
string table_path = path + ".lexicon";
grn_obj *lexicon_table = grn_table_create(ctx, table_name.c_str(), table_name.size(),
table_path.c_str(),
GRN_OBJ_PERSISTENT | GRN_OBJ_TABLE_PAT_KEY,
grn_ctx_at(ctx, GRN_DB_SHORT_TEXT), NULL);
assert(lexicon_table);
grn_obj_set_info(ctx, lexicon_table, GRN_INFO_DEFAULT_TOKENIZER,
grn_ctx_at(ctx, GRN_DB_BIGRAM));
// create inverted index column on lexicon table
string column_name = "index";
string column_path = path + ".ii";
grn_obj *idx_column = grn_column_create(ctx, lexicon_table,
column_name.c_str(), column_name.size(),
NULL,
GRN_OBJ_COLUMN_INDEX | GRN_OBJ_PERSISTENT | GRN_OBJ_WITH_POSITION,
docs_table);
assert(idx_column);
// link body column and inverted index
GRN_UINT32_SET(ctx, &intbuf, grn_obj_id(ctx, body_column));
grn_obj_set_info(ctx, idx_column, GRN_INFO_SOURCE, &intbuf); /* need to use grn_id */
// update
vector<string> vs;
vs.push_back("新垣結衣");
vs.push_back("佐々木希");
vs.push_back("戸田恵梨香");
vs.push_back("安めぐみ");
vs.push_back("北川景子");
vs.push_back("深田恭子");
cerr << "add" << endl;
for (unsigned int i = 0; i < vs.size(); i++){
const string& txt = vs[i];
grn_id docid = grn_table_add(ctx, docs_table, NULL, 0, NULL);
GRN_TEXT_SETS(ctx, &textbuf, txt.c_str());
grn_obj_set_value(ctx, body_column, docid, &textbuf, GRN_OBJ_SET);
cerr << "\tadd: id=" << docid << ", str=" << txt << endl;
}
// select by expr
string q = "body:@子";
grn_obj *res = grn_table_select(ctx, docs_table, query(ctx, docs_table, ("body:@" + q).c_str()), NULL, GRN_OP_OR);
assert(res);
cerr << "search:" << endl;
cerr << "\tquery: " << q << endl;
cerr << "\tnhits: " << grn_table_size(ctx, res) << endl;
// iterate!
grn_id id;
int from = 0;
int count = 10;
grn_table_cursor *cursor = grn_table_cursor_open(ctx, res, NULL, 0, NULL, 0, from, count, 0);
while ((id = grn_table_cursor_next(ctx, cursor)) != GRN_ID_NIL) {
void *key;
int size;
size = grn_table_cursor_get_key(ctx, cursor, &key);
assert(size == 4);
grn_id docid = *((int*)key);
cerr << "\tselect: id=" << docid << endl;
}
grn_table_cursor_close(ctx, cursor);
// TODO: too many leaks. fix for production code.
}
int main()
{
system("rm -fR kzk*");
CreateIndex("kzk");
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment