Last active
August 29, 2015 14:10
-
-
Save xqyww123/4f34aa48fe74f0215949 to your computer and use it in GitHub Desktop.
A patch to fix a bug in libmmseg for ruby
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <fstream> | |
#include <string> | |
#include <iostream> | |
#include <cstdio> | |
#include <ctype.h> | |
#include <ruby.h> | |
/* Ruby 1.7 and later supply NUM2LL(), LL2NUM() and ULL2NUM(); these are
 * long-based fallbacks for interpreters that do not. */
#if !defined(NUM2LL)
#  define NUM2LL(x) NUM2LONG((x))
#endif
#if !defined(LL2NUM)
#  define LL2NUM(x) INT2NUM((long) (x))
#endif
#if !defined(ULL2NUM)
#  define ULL2NUM(x) UINT2NUM((unsigned long) (x))
#endif

/* NUM2ULL() is not (yet) provided by Ruby 1.7, so pick an implementation
 * depending on whether the build has a long long type. */
#if !defined(NUM2ULL)
#  if defined(HAVE_LONG_LONG)
#    define NUM2ULL(x) rb_num2ull((x))
#  else
#    define NUM2ULL(x) NUM2ULONG(x)
#  endif
#endif

/* The RSTRING_xxx / RARRAY_xxx accessors are new in Ruby 1.9, where direct
 * ->ptr / ->len access no longer works. Define them for older versions so
 * the rest of the file can be written the new way. */
#if !defined(RSTRING_LEN)
#  define RSTRING_LEN(x) RSTRING(x)->len
#endif
#if !defined(RSTRING_PTR)
#  define RSTRING_PTR(x) RSTRING(x)->ptr
#endif
#if !defined(RARRAY_LEN)
#  define RARRAY_LEN(x) RARRAY(x)->len
#endif
#if !defined(RARRAY_PTR)
#  define RARRAY_PTR(x) RARRAY(x)->ptr
#endif
#include <stdio.h> | |
#include <stdexcept> | |
/* Calling-convention marker: __stdcall on Windows-like targets, empty
 * everywhere else. An existing definition is left untouched. */
#if !defined(SWIGSTDCALL)
#  if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__)
#    define SWIGSTDCALL __stdcall
#  else
#    define SWIGSTDCALL
#  endif
#endif
#include "SegmenterManager.h" | |
#include "Segmenter.h" | |
#ifdef __cplusplus | |
extern "C" { | |
#endif | |
/* Segmenter manager used by mmseg_open() to create segmenters.
 * FIXME: should this be unloaded when the shared object is unloaded? */
css::SegmenterManager g_mgr;
/* Set to 1 by mmseg_open() once g_mgr.init() has returned 0. */
int g_bInited = 0;
/*
 * Free callback attached to every wrapped Mmseg data object.
 * It releases nothing at present; check_mmseg() compares an object's
 * dfree pointer against this function to recognise Mmseg objects.
 */
static void
mmseg_dfree(void *cd)
{
    (void)cd; /* nothing to clean up yet */
}
#define MMSEG_FREE mmseg_dfree
/*
 * Placeholder for releasing the payload of an Mmseg object.
 * Nothing is freed yet; the argument is ignored and nil is returned.
 */
static VALUE
mmseg_free(VALUE cd)
{
    (void)cd; /* TODO: do the actual free here */
    return Qnil;
}
/*
 * Verify that obj is a T_DATA object whose dfree callback is MMSEG_FREE
 * and return its wrapped data pointer cast to VALUE.
 * Raises ArgumentError (naming obj's class) when the callback differs.
 */
static VALUE
check_mmseg(VALUE obj)
{
    void *wrapped;

    Check_Type(obj, T_DATA);
    if (RDATA(obj)->dfree != MMSEG_FREE)
        rb_raise(rb_eArgError, "mmseg expected (%s)", rb_class2name(CLASS_OF(obj)));

    wrapped = DATA_PTR(obj);
    return (VALUE)wrapped;
}
/*
 * Allocator for Mmseg: wraps a null data pointer with no mark function
 * and MMSEG_FREE as the free callback.
 */
static VALUE
mmseg_s_allocate(VALUE klass)
{
    VALUE empty_obj = Data_Wrap_Struct(klass, NULL, MMSEG_FREE, NULL);
    return empty_obj;
}
/*
 * Mmseg#initialize -- validates self via check_mmseg(), hands the old
 * payload to mmseg_free(), clears the data pointer and zeroes the
 * @start, @end and @ri instance variables. Returns self.
 */
static VALUE
mmseg_initialize(VALUE self)
{
    static const char *const zeroed_ivars[] = { "@start", "@end", "@ri" };
    VALUE previous = check_mmseg(self);
    unsigned k;

    mmseg_free(previous);
    DATA_PTR(self) = NULL;

    for (k = 0; k < sizeof(zeroed_ivars) / sizeof(zeroed_ivars[0]); ++k)
        rb_iv_set(self, zeroed_ivars[k], INT2NUM(0));

    return self;
}
/*
 * Mmseg#next -- consume the next token from the wrapped segmenter.
 *
 * Instance variables updated on every call:
 *   @ri     byte offset into @str just past the consumed token
 *   @start  character index where the token starts (the previous @end)
 *   @end    character index just past the token
 *
 * The segmenter reports token lengths in bytes. To turn that into a
 * character count, the token's bytes in @str are scanned and UTF-8
 * continuation bytes (10xxxxxx) are skipped.
 *
 * Returns self when a token of non-zero length was consumed, nil otherwise.
 * StringValuePtr raises TypeError when @str is not convertible to a String.
 */
static VALUE mmseg_next(VALUE self)
{
    u2 tok_len = 0;
    int nPos = 0, ri = 0;
    css::Segmenter* seg = NULL;
    VALUE v_str = rb_iv_get(self, "@str");
    const char* str = StringValuePtr(v_str);
    long str_len = RSTRING_LEN(v_str);

    Data_Get_Struct(self, css::Segmenter, seg);
    if (seg) {
        u2 len = 0, symlen = 0;
        const char* tok = (const char*)seg->peekToken(len, symlen);
        //FIXME: if the ruby version does not enable symlen, len and symlen are always the same.
        if (tok && *tok && len)
            tok_len = len;
        /* popToken is called even when peekToken produced nothing. */
        seg->popToken(len);
    }

    /* The new token starts where the previous one ended. */
    VALUE vPos = rb_iv_get(self, "@end");
    if (!NIL_P(vPos))
        nPos = FIX2INT(vPos);

    VALUE v_ri = rb_iv_get(self, "@ri");
    if (!NIL_P(v_ri))
        ri = FIX2INT(v_ri);
    rb_iv_set(self, "@ri", INT2NUM(ri + tok_len));

    /* Clamp the scan window to the string: @ri can be stale or out of range
     * (for example when the text was replaced by a shorter one), and reading
     * str[] past its end would be undefined behaviour. */
    long scan_from = ri < 0 ? 0 : (long)ri;
    long scan_to = (long)ri + tok_len;
    if (scan_to > str_len)
        scan_to = str_len;

    /* Count characters: every byte that is not a UTF-8 continuation byte. */
    int rlen = 0;
    for (long i = scan_from; i < scan_to; ++i) {
        if ((str[i] & 0xc0) != 0x80)
            rlen++;
    }

    rb_iv_set(self, "@start", INT2NUM(nPos));
    rb_iv_set(self, "@end", INT2NUM(nPos + rlen));

    return tok_len ? self : Qnil;
}
/* Mmseg#start -- returns the @start instance variable. */
static VALUE
mmseg_start(VALUE self)
{
    VALUE start_pos = rb_iv_get(self, "@start");
    return start_pos;
}
/* Mmseg#end -- returns the @end instance variable. */
static VALUE
mmseg_end(VALUE self)
{
    VALUE end_pos = rb_iv_get(self, "@end");
    return end_pos;
}
/*
 * Mmseg#setText(str) -- give the wrapped segmenter a new text to segment.
 *
 * str is re-encoded to UTF-8 via String#encode and the encoded copy is
 * stored in @str, which keeps the buffer passed to the segmenter
 * referenced from Ruby. @start, @end and @ri are reset to 0 so
 * that Mmseg#next counts characters from the beginning of the new text.
 *
 * Returns self, or nil when str is not a String or when this object wraps
 * no segmenter (e.g. it was built with Mmseg.new rather than createSeg).
 */
static VALUE
mmseg_settext(VALUE self, VALUE str)
{
    if (TYPE(str) != T_STRING)
        return Qnil;

    str = rb_funcall(str, rb_intern("encode"), 1, rb_str_new2("utf-8"));
    int len = RSTRING_LEN(str);
    const char* pstr = StringValuePtr(str);

    css::Segmenter* seg = NULL;
    Data_Get_Struct(self, css::Segmenter, seg);
    if (!seg)
        return Qnil; /* nothing to feed the text to; avoid a NULL dereference */

    seg->setBuffer((u1*)pstr, len);
    rb_iv_set(self, "@str", str);
    rb_iv_set(self, "@start", INT2NUM(0));
    rb_iv_set(self, "@end", INT2NUM(0));
    /* The byte cursor must restart with the new text as well. */
    rb_iv_set(self, "@ri", INT2NUM(0));
    return self;
}
/*
 * Mmseg.createSeg(dict_path, str) -- build a segmenter object for str.
 *
 * On the first call with a String dict_path the global manager g_mgr is
 * initialised from that path; later calls reuse it. str is re-encoded to
 * UTF-8 via String#encode and the encoded copy is stored in @str on the
 * returned object, which keeps the buffer given to the segmenter
 * referenced from Ruby.
 *
 * self is the receiver of the singleton method; it is passed to
 * Data_Wrap_Struct as the class of the new object.
 *
 * Returns the new object, or nil when str is not a String, when the manager
 * has not been initialised, or when no segmenter could be obtained.
 * Calls rb_fatal when g_mgr.init() reports failure. StringValueCStr raises
 * ArgumentError when dict_path contains a NUL byte.
 */
static VALUE
mmseg_open(VALUE self, VALUE dict_path, VALUE str)
{
    if (TYPE(str) != T_STRING)
        return Qnil;

    str = rb_funcall(str, rb_intern("encode"), 1, rb_str_new2("utf-8"));
    int len = RSTRING_LEN(str);
    const char* pstr = StringValuePtr(str);

    if (!g_bInited && TYPE(dict_path) == T_STRING) {
        /* The path is consumed as a C string, so insist on NUL termination. */
        int nRet = g_mgr.init(StringValueCStr(dict_path));
        if (nRet != 0) {
            // should throw an exception
            rb_fatal("Can NOT init the segment library.");
            return Qnil; /* not reached */
        }
        g_bInited = 1;
    }
    if (!g_bInited)
        return Qnil;

    //do segment
    css::Segmenter* seg = g_mgr.getSegmenter();
    if (!seg)
        return Qnil; /* no segmenter available; avoid a NULL dereference */

    seg->setBuffer((u1*)pstr, len);
    /* NOTE(review): MMSEG_FREE releases nothing, so the segmenter is never
     * handed back to the manager -- confirm the intended ownership. */
    self = Data_Wrap_Struct(self, NULL, MMSEG_FREE, (void *)seg);
    rb_iv_set(self, "@str", str);
    return self;
}
VALUE cMMseg; | |
void Init_mmseg() { | |
cMMseg = rb_define_class("Mmseg", rb_cData); | |
rb_define_alloc_func(cMMseg, mmseg_s_allocate); | |
rb_define_singleton_method(cMMseg, "createSeg", RUBY_METHOD_FUNC(mmseg_open), 2); | |
rb_define_method(cMMseg, "initialize", RUBY_METHOD_FUNC(mmseg_initialize), 0); | |
rb_define_method(cMMseg, "setText", RUBY_METHOD_FUNC(mmseg_settext), 1); | |
rb_define_method(cMMseg, "next", RUBY_METHOD_FUNC(mmseg_next), 0); | |
rb_define_method(cMMseg, "start", RUBY_METHOD_FUNC(mmseg_start), 0); | |
rb_define_method(cMMseg, "end", RUBY_METHOD_FUNC(mmseg_end), 0); | |
} | |
#ifdef __cplusplus | |
} | |
#endif |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
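# Fixes a silly bug: token positions were returned as byte indices into the UTF-8 C string rather than as character indices. Note: the diff below lists the patched file (2014) as the "from" side and the original (2010) as the "to" side, so apply it in reverse (patch -R) to get the fix.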
*** rubyapi.cpp 2014-11-20 14:34:45.735467250 +0800 | |
--- rubyapi.cpp 2010-05-03 03:16:17.000000000 +0800 | |
*************** | |
*** 104,127 **** | |
DATA_PTR(self) = NULL; | |
rb_iv_set(self, "@start", INT2NUM(0)); | |
rb_iv_set(self, "@end", INT2NUM(0)); | |
- rb_iv_set(self, "@ri", INT2NUM(0)); | |
return self; | |
} | |
static VALUE mmseg_next(VALUE self) | |
{ | |
u2 tok_len = 0; | |
! int nPos = 0, ri; | |
css::Segmenter* seg = NULL; | |
- VALUE v_str = rb_iv_get(self, "@str"); | |
- char* str = StringValuePtr(v_str); | |
Data_Get_Struct(self, css::Segmenter, seg); | |
//printf("%d",seg); //check is got it | |
if(seg){ | |
u2 len = 0, symlen = 0; | |
char* tok = (char*)seg->peekToken(len,symlen); | |
//printf("%s\t",tok); | |
- //printf("xxx %d : %d -- %d\n", len, symlen, tok); | |
//FIXME: if ruby version do not enbale symlen, the len and symlen always the same. | |
if(!tok || !*tok || !len) | |
tok_len = 0; | |
--- 104,123 ---- | |
DATA_PTR(self) = NULL; | |
rb_iv_set(self, "@start", INT2NUM(0)); | |
rb_iv_set(self, "@end", INT2NUM(0)); | |
return self; | |
} | |
static VALUE mmseg_next(VALUE self) | |
{ | |
u2 tok_len = 0; | |
! int nPos = 0; | |
css::Segmenter* seg = NULL; | |
Data_Get_Struct(self, css::Segmenter, seg); | |
//printf("%d",seg); //check is got it | |
if(seg){ | |
u2 len = 0, symlen = 0; | |
char* tok = (char*)seg->peekToken(len,symlen); | |
//printf("%s\t",tok); | |
//FIXME: if ruby version do not enbale symlen, the len and symlen always the same. | |
if(!tok || !*tok || !len) | |
tok_len = 0; | |
*************** | |
*** 134,149 **** | |
if(!NIL_P(vPos)){ | |
nPos = FIX2INT(vPos); | |
} | |
- VALUE v_ri = rb_iv_get(self, "@ri"); | |
- if (NIL_P(v_ri)) ri = 0; | |
- else ri = FIX2INT(v_ri); | |
- rb_iv_set(self, "@ri", INT2NUM(ri + tok_len)); | |
- int rlen = 0, i; | |
- for (i=ri;i<ri+tok_len;++i) | |
- if ((str[i] & 0xc0) != 0x80) | |
- rlen++; | |
rb_iv_set(self, "@start", INT2NUM(nPos)); | |
! rb_iv_set(self, "@end", INT2NUM(nPos+rlen)); | |
if(tok_len) | |
return self; | |
else | |
--- 130,137 ---- | |
if(!NIL_P(vPos)){ | |
nPos = FIX2INT(vPos); | |
} | |
rb_iv_set(self, "@start", INT2NUM(nPos)); | |
! rb_iv_set(self, "@end", INT2NUM(nPos+tok_len)); | |
if(tok_len) | |
return self; | |
else | |
*************** | |
*** 166,174 **** | |
int len; | |
const char* pstr; | |
if (TYPE(str) == T_STRING) { | |
- str = rb_funcall(str, rb_intern("encode"), 1, rb_str_new2("utf-8")); | |
len = RSTRING_LEN(str); | |
! pstr = StringValuePtr(str); | |
//printf("%d:%s\n",len,pstr); | |
}else | |
return Qnil; | |
--- 154,161 ---- | |
int len; | |
const char* pstr; | |
if (TYPE(str) == T_STRING) { | |
len = RSTRING_LEN(str); | |
! pstr = STR2CSTR(str); | |
//printf("%d:%s\n",len,pstr); | |
}else | |
return Qnil; | |
*************** | |
*** 177,183 **** | |
Data_Get_Struct(self, css::Segmenter, seg); | |
//printf("%s",pstr); | |
seg->setBuffer((u1*)pstr,len); | |
- rb_iv_set(self, "@str", str); | |
rb_iv_set(self, "@start", INT2NUM(0)); | |
rb_iv_set(self, "@end", INT2NUM(0)); | |
return self; | |
--- 164,169 ---- | |
*************** | |
*** 191,205 **** | |
int len; | |
const char* pstr; | |
if (TYPE(str) == T_STRING) { | |
- str = rb_funcall(str, rb_intern("encode"), 1, rb_str_new2("utf-8")); | |
len = RSTRING_LEN(str); | |
! pstr = StringValuePtr(str); | |
//printf("%d:%s\n",len,pstr); | |
}else | |
return Qnil; | |
if (!g_bInited && TYPE(dict_path) == T_STRING) { | |
! int nRet = g_mgr.init(StringValuePtr(dict_path)); | |
if(nRet != 0) { | |
// should throw an exception | |
rb_fatal("Can NOT init the segment library."); | |
--- 177,190 ---- | |
int len; | |
const char* pstr; | |
if (TYPE(str) == T_STRING) { | |
len = RSTRING_LEN(str); | |
! pstr = STR2CSTR(str); | |
//printf("%d:%s\n",len,pstr); | |
}else | |
return Qnil; | |
if (!g_bInited && TYPE(dict_path) == T_STRING) { | |
! int nRet = g_mgr.init(STR2CSTR(dict_path)); | |
if(nRet != 0) { | |
// should throw an exception | |
rb_fatal("Can NOT init the segment library."); | |
*************** | |
*** 217,223 **** | |
}else | |
return Qnil; | |
- rb_iv_set(self, "@str", str); | |
return self; | |
} | |
--- 202,207 ---- | |
*************** | |
*** 236,239 **** | |
#ifdef __cplusplus | |
} | |
! #endif | |
--- 220,223 ---- | |
#ifdef __cplusplus | |
} | |
! #endif | |
\ 文件尾没有 newline 字符 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment