Skip to content

Instantly share code, notes, and snippets.

@ColonelThirtyTwo
Last active September 24, 2023 22:08
Show Gist options
  • Save ColonelThirtyTwo/3dd1fe04e4cff0502fa70d12f3a6e72e to your computer and use it in GitHub Desktop.
Save ColonelThirtyTwo/3dd1fe04e4cff0502fa70d12f3a6e72e to your computer and use it in GitHub Desktop.
Rusqlite FTS5 tokenizer module
use any_ascii::any_ascii_char;
use unicode_normalization::UnicodeNormalization;
use unicode_segmentation::UnicodeSegmentation;
use crate::sqlite3_fts5::Tokenizer;
/// A Unicode-aware FTS5 tokenizer that produces ASCII, lowercased, stemmed tokens.
///
/// The operations the tokenizer performs, in order:
/// 1. Splits the data on Unicode-defined words (`UnicodeSegmentation::unicode_word_indices`).
/// 2. Converts each word to NFKC.
/// 3. Converts each word to ASCII using `any_ascii`.
/// 4. ASCII-lowercases each word.
/// 5. Stems each word using the Porter algorithm.
///
/// Should be fairly Unicode-aware while retaining searchability on a US keyboard.
pub struct ColTokenizer;
impl Tokenizer for ColTokenizer {
    /// The tokenizer keeps no shared state, so there is no global data.
    type Global = ();

    /// Creates the tokenizer. All tokenizer arguments are ignored.
    fn new(&(): &Self::Global, _args: Vec<String>) -> Result<Self, rusqlite::Error> {
        Ok(Self)
    }

    /// Splits `text` into Unicode words and emits one normalized token per word.
    ///
    /// Each word is converted to NFKC, transliterated to ASCII, ASCII-lowercased and
    /// stemmed before being passed to `push_token`. The range passed to the callback is
    /// always the byte range of the *original* word inside `text`.
    ///
    /// `text` is not required to be valid UTF-8: invalid byte sequences are skipped and
    /// act as word separators, and the offsets of the words around them stay exact.
    ///
    /// # Errors
    ///
    /// Returns the first error produced by `push_token`; tokenization stops there.
    fn tokenize<TKF>(
        &mut self,
        _reason: crate::sqlite3_fts5::TokenizeReason,
        text: &[u8],
        mut push_token: TKF,
    ) -> Result<(), rusqlite::Error>
    where
        TKF: FnMut(&[u8], std::ops::Range<usize>, bool) -> Result<(), rusqlite::Error>,
    {
        // Scratch buffers reused for every word to avoid per-token allocations.
        let mut ascii_buffer = String::new();
        let mut stemmed_buffer = String::new();

        // Walk the input one valid UTF-8 run at a time. A lossy conversion would replace
        // every invalid sequence with a 3-byte U+FFFD, shifting all later offsets away
        // from their true position in `text`.
        let mut offset = 0;
        while offset < text.len() {
            let rest = &text[offset..];
            let (valid, invalid_len) = match std::str::from_utf8(rest) {
                Ok(valid) => (valid, 0),
                Err(err) => {
                    let (valid, after) = rest.split_at(err.valid_up_to());
                    let valid = std::str::from_utf8(valid)
                        .expect("bytes before `valid_up_to` are valid UTF-8");
                    // `error_len` is `None` when the input ends in the middle of a
                    // sequence; in that case everything that is left is invalid.
                    (valid, err.error_len().unwrap_or(after.len()))
                }
            };

            for (i, word) in valid.unicode_word_indices() {
                let start = offset + i;
                let range = start..start + word.len();

                ascii_buffer.clear();
                ascii_buffer.extend(word.nfkc().map(any_ascii_char));
                ascii_buffer.make_ascii_lowercase();

                let mut graphemes = ascii_buffer.graphemes(true).collect();
                crate::porter_stemmer::stem_tokenized(&mut graphemes);

                stemmed_buffer.clear();
                stemmed_buffer.extend(graphemes.into_iter());

                (push_token)(stemmed_buffer.as_bytes(), range, false)?;
            }

            // Skip the valid run just processed plus the invalid bytes that ended it.
            offset += valid.len() + invalid_len;
        }
        Ok(())
    }
}
use std::{
convert::TryFrom,
ffi::{
c_void,
CStr,
CString,
},
os::raw::{
c_char,
c_int,
},
panic::{
catch_unwind,
AssertUnwindSafe,
},
};
use rusqlite::ffi::{
self,
SQLITE_ERROR,
SQLITE_OK,
};
/// `FTS5_TOKENIZE_QUERY` and `FTS5_TOKENIZE_PREFIX` OR-ed together into one constant,
/// so that the combination can be used as a pattern in a `match`.
const FTS5_TOKENIZE_QUERY_PREFIX: c_int = ffi::FTS5_TOKENIZE_QUERY | ffi::FTS5_TOKENIZE_PREFIX;
/// Reason the tokenizer is being called.
///
/// This is the decoded form of the `flags` argument SQLite passes to `xTokenize`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizeReason {
    /// A document is being inserted or removed.
    Document,
    /// Running a `MATCH` query.
    Query {
        /// Whether this is a prefix query. If so, the last token emitted will be treated
        /// as a token prefix.
        prefix: bool,
    },
    /// Invoked manually through the FTS5 `xTokenize` API.
    Aux,
}
impl TokenizeReason {
    /// Decodes the `flags` value handed to `xTokenize`.
    ///
    /// Returns `None` when `v` is not one of the four flag combinations handled here.
    fn from_const(v: c_int) -> Option<Self> {
        match v {
            ffi::FTS5_TOKENIZE_DOCUMENT => Some(Self::Document),
            ffi::FTS5_TOKENIZE_QUERY => Some(Self::Query { prefix: false }),
            FTS5_TOKENIZE_QUERY_PREFIX => Some(Self::Query { prefix: true }),
            ffi::FTS5_TOKENIZE_AUX => Some(Self::Aux),
            _ => None,
        }
    }
}
/// A custom FTS5 tokenizer implemented in Rust.
///
/// Implementors are registered with `register_tokenizer`. The C callbacks in this file
/// create an instance with `new`, feed it text through `tokenize`, and drop it when
/// SQLite deletes the tokenizer.
pub trait Tokenizer: Sized + Send + 'static {
/// Global data available to the `new` function.
///
/// One value is supplied at registration time and a shared reference to it is passed
/// to every `new` call.
type Global: Send + 'static;
/// Creates a new instance of the tokenizer.
///
/// `args` are the tokenizer arguments received from SQLite, converted to `String`s
/// (lossily, if they are not valid UTF-8).
///
/// When called through the C glue in this file, a `rusqlite::Error::SqliteFailure`
/// is reported to SQLite with its extended code; any other error is reported as
/// `SQLITE_ERROR`.
fn new(global: &Self::Global, args: Vec<String>) -> Result<Self, rusqlite::Error>;
/// Tokenizes a string.
///
/// Should inspect the `text` bytes and call the `push_token` callback for each token.
/// `text` is handed over as raw bytes; the glue in this file does not validate it as UTF-8.
///
/// The callback takes 3 arguments: the token, the byte range within `text` that the
/// token appears in, and a boolean flag that corresponds to the `FTS5_TOKEN_COLOCATED` flag.
/// The range must lie within `text`; the C glue asserts this before forwarding the token.
///
/// An error returned by `push_token` should be propagated to the caller.
fn tokenize<TKF>(
&mut self,
reason: TokenizeReason,
text: &[u8],
push_token: TKF,
) -> Result<(), rusqlite::Error>
where
TKF: FnMut(&[u8], std::ops::Range<usize>, bool) -> Result<(), rusqlite::Error>;
}
unsafe extern "C" fn c_xcreate<T: Tokenizer>(
global: *mut c_void,
args: *mut *const c_char,
nargs: c_int,
out_tok: *mut *mut ffi::Fts5Tokenizer,
) -> c_int {
let global = &*global.cast::<T::Global>();
let args = (0..nargs as usize)
.map(|i| *args.add(i))
.map(|s| CStr::from_ptr(s).to_string_lossy().into_owned())
.collect::<Vec<_>>();
let res = catch_unwind(AssertUnwindSafe(move || T::new(global, args)));
match res {
Ok(Ok(v)) => {
let bp = Box::into_raw(Box::new(v));
*out_tok = bp.cast::<ffi::Fts5Tokenizer>();
SQLITE_OK
}
Ok(Err(rusqlite::Error::SqliteFailure(e, _))) => e.extended_code,
Ok(Err(_)) => SQLITE_ERROR,
Err(msg) => {
error!(
"<{} as Tokenizer>::new paniced: {}",
std::any::type_name::<T>(),
panic_err_to_str(&msg)
);
SQLITE_ERROR
}
}
}
/// `xDelete` callback: takes back ownership of the boxed tokenizer and drops it.
///
/// A panic raised by `T`'s destructor is caught and logged instead of unwinding into C.
///
/// # Safety
///
/// `v` must be a pointer previously produced by `c_xcreate::<T>` that has not been
/// deleted yet.
unsafe extern "C" fn c_xdelete<T: Tokenizer>(v: *mut ffi::Fts5Tokenizer) {
    let tokenizer = Box::from_raw(v.cast::<T>());
    if let Err(payload) = catch_unwind(AssertUnwindSafe(move || drop(tokenizer))) {
        error!(
            "{}::drop paniced: {}",
            std::any::type_name::<T>(),
            panic_err_to_str(&payload)
        );
    }
}
/// Destructor callback for the tokenizer's global data: reclaims the boxed
/// `T::Global` and drops it.
///
/// A panic raised by the destructor is caught and logged instead of unwinding into C.
///
/// # Safety
///
/// `v` must be a pointer obtained from `Box::<T::Global>::into_raw` that has not been
/// freed yet.
unsafe extern "C" fn c_xdestroy<T: Tokenizer>(v: *mut c_void) {
    let global = Box::from_raw(v.cast::<T::Global>());
    if let Err(payload) = catch_unwind(AssertUnwindSafe(move || drop(global))) {
        error!(
            "{}::drop paniced: {}",
            std::any::type_name::<T::Global>(),
            panic_err_to_str(&payload)
        );
    }
}
unsafe extern "C" fn c_xtokenize<T: Tokenizer>(
this: *mut ffi::Fts5Tokenizer,
ctx: *mut c_void,
flags: c_int,
data: *const c_char,
data_len: c_int,
push_token: Option<
unsafe extern "C" fn(*mut c_void, c_int, *const c_char, c_int, c_int, c_int) -> c_int,
>,
) -> c_int {
let this = &mut *this.cast::<T>();
let reason = match TokenizeReason::from_const(flags) {
Some(v) => v,
None => {
error!("Unrecognized flags passed to xTokenize: {}", flags);
return SQLITE_ERROR;
}
};
let data = std::slice::from_raw_parts(data.cast::<u8>(), data_len as usize);
let push_token = push_token.unwrap();
let push_token = |token: &[u8],
range: std::ops::Range<usize>,
colocated: bool|
-> Result<(), rusqlite::Error> {
let ntoken = c_int::try_from(token.len()).expect("Token length is took long");
assert!(
range.start <= data.len() && range.end <= data.len(),
"Token range is invalid. Range is {:?}, data length is {}",
range,
data.len(),
);
let start = range.start as c_int;
let end = range.end as c_int;
let flags = if colocated {
ffi::FTS5_TOKEN_COLOCATED
} else {
0
};
let res = (push_token)(
ctx,
flags,
token.as_ptr().cast::<c_char>(),
ntoken,
start,
end,
);
if res == SQLITE_OK {
Ok(())
} else {
Err(rusqlite::Error::SqliteFailure(
rusqlite::ffi::Error::new(res),
None,
))
}
};
match catch_unwind(AssertUnwindSafe(|| this.tokenize(reason, data, push_token))) {
Ok(Ok(())) => SQLITE_OK,
Ok(Err(rusqlite::Error::SqliteFailure(e, _))) => e.extended_code,
Ok(Err(_)) => SQLITE_ERROR,
Err(msg) => {
error!(
"<{} as Tokenizer>::tokenize paniced: {}",
std::any::type_name::<T>(),
panic_err_to_str(&msg)
);
SQLITE_ERROR
}
}
}
/// Extracts a printable message from a panic payload.
///
/// Payloads are a `String` or a `&'static str` when the panic was raised with a message;
/// anything else yields a fixed placeholder.
fn panic_err_to_str(msg: &Box<dyn std::any::Any + Send>) -> &str {
    msg.downcast_ref::<String>()
        .map(String::as_str)
        .or_else(|| msg.downcast_ref::<&'static str>().copied())
        .unwrap_or("<non-string panic reason>")
}
/// Registers the tokenizer `T` under `name` on the given connection.
///
/// The FTS5 API pointer is fetched by running `SELECT fts5(?1)` with a pointer binding.
/// `global_data` is boxed and handed to SQLite, which owns it from then on and releases
/// it through the destroy callback.
///
/// # Errors
///
/// Returns a description of the failing step when the statement cannot be prepared,
/// the FTS5 API is not available, `name` contains a NUL byte, or `xCreateTokenizer`
/// is missing or fails. If registration fails, `global_data` is dropped before returning.
pub fn register_tokenizer<T: Tokenizer>(
    db: &mut rusqlite::Connection,
    global_data: T::Global,
    name: &str,
) -> Result<(), String> {
    // SAFETY: `db.handle()` is the live connection behind `db`, which is borrowed for the
    // whole call. Every pointer passed to SQLite below outlives the call it is passed to.
    unsafe {
        let dbp = db.handle();
        let mut api: *mut ffi::fts5_api = std::ptr::null_mut();
        let mut stmt: *mut ffi::sqlite3_stmt = std::ptr::null_mut();
        let q = "SELECT fts5(?1)";
        if ffi::sqlite3_prepare(
            dbp,
            q.as_ptr().cast::<c_char>(),
            q.len() as c_int,
            &mut stmt,
            std::ptr::null_mut(),
        ) != SQLITE_OK
        {
            return Err("sqlite3_prepare failed".into());
        }
        // Stepping the statement makes fts5 write its API pointer into `api`. Failures
        // of these calls are detected by `api` still being null afterwards.
        ffi::sqlite3_bind_pointer(
            stmt,
            1,
            (&mut api) as *mut _ as *mut c_void,
            "fts5_api_ptr\0".as_ptr().cast::<c_char>(),
            None,
        );
        ffi::sqlite3_step(stmt);
        ffi::sqlite3_finalize(stmt);
        if api.is_null() {
            return Err("Could not get fts5 api".into());
        }

        let name = CString::new(name).map_err(|_| "Name has a null character in it")?;
        let create_tokenizer = (*api)
            .xCreateTokenizer
            .ok_or("fts5 api does not provide xCreateTokenizer")?;

        // Box the global data only once nothing else can fail before the hand-over.
        let global_data = Box::into_raw(Box::new(global_data));
        let e = create_tokenizer(
            api,
            name.as_ptr(),
            global_data.cast::<c_void>(),
            &mut ffi::fts5_tokenizer {
                xCreate: Some(c_xcreate::<T>),
                xDelete: Some(c_xdelete::<T>),
                xTokenize: Some(c_xtokenize::<T>),
            },
            Some(c_xdestroy::<T>),
        );
        if e != SQLITE_OK {
            // SQLite did not take ownership and will not call the destroy callback for a
            // failed registration, so reclaim the box here instead of leaking it.
            drop(Box::from_raw(global_data));
            return Err("xCreateTokenizer failed".into());
        }
        Ok(())
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment