Skip to content

Instantly share code, notes, and snippets.

@a10y
Created February 13, 2025 21:43
Show Gist options
  • Save a10y/ced7051d73c5a2dead1efcbe5639ccbb to your computer and use it in GitHub Desktop.
plugins {
    `java-library`
    `maven-publish`
    `signing`
}

// Resolvable-only configuration that receives the native shared library
// produced by the :tokenizers-rs subproject.
val tokenizerSharedLibrary by configurations.creating {
    isCanBeConsumed = false
}

dependencies {
    tokenizerSharedLibrary(project(path = ":tokenizers-rs", configuration = "tokenizerSharedLibrary"))
    api("net.java.dev.jna:jna-platform")
    implementation("com.google.guava:guava")
    implementation("org.slf4j:slf4j-api")
    testImplementation(platform("org.junit:junit-bom"))
    testImplementation("org.junit.jupiter:junit-jupiter-engine")
    testImplementation("org.assertj:assertj-core")
}

// Normalize the JVM's architecture name ("amd64"/"x86_64") to the
// "x86-64" spelling used for the resource directory; pass anything else through.
val osName = System.getProperty("os.name")
val osArch = when (val arch = System.getProperty("os.arch")) {
    "amd64", "x86_64" -> "x86-64"
    else -> arch
}
println("System: $osName $osArch")

// Resource directory is <platform>-<arch>; anything non-Mac is treated as Linux here.
val resourceDir = if (osName.startsWith("Mac")) "darwin-$osArch" else "linux-$osArch"
println("resourceDirname = $resourceDir")

// Copy the built shared library into the main resource tree so it is
// packaged on the classpath for loading at runtime.
val copySharedLibrary by tasks.registering(Copy::class) {
    from(tokenizerSharedLibrary)
    into(projectDir.resolve("src/main/resources/$resourceDir"))
    doLast {
        println("Copied ${tokenizerSharedLibrary.files.size} files into resource directory")
    }
}

// Ensure the native library is in place before resources are processed.
tasks.withType<ProcessResources>().configureEach {
    dependsOn(copySharedLibrary)
}
/*
* (c) Copyright 2023 Intrinsic Labs, LLC. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ai.intrinsiclabs.tokenizers.jna;
import com.google.errorprone.annotations.MustBeClosed;
import com.sun.jna.Native;
import com.sun.jna.Pointer;
/**
* Java Native Access Direct Mapping class holding methods that map to native calls to our {@code jtokenizers} Rust
* library.
*/
public final class TokenizerLibrary {
// JNA Direct Mapping: register the native library once at class-load time;
// each `native` method below is bound to the exported symbol of the same name.
static {
Native.register("jtokenizers");
}
/**
 * Create a tokenizer from a {@code "tokenizers.json"} file.
 *
 * @param path filesystem path to the serialized tokenizer file
 * @return opaque pointer to the native tokenizer; release with {@link #dropTokenizer}
 */
public static native TokenizerPtr newTokenizerFromFile(String path);
/**
 * Create a tokenizer from the contents of a {@code tokenizers.json} file
 * held in memory as a string.
 *
 * @param json serialized tokenizer JSON
 * @return opaque pointer to the native tokenizer; release with {@link #dropTokenizer}
 */
static native TokenizerPtr newTokenizerFromString(String json);
/**
 * Free the native tokenizer. The pointer must not be used after this call.
 */
static native void dropTokenizer(TokenizerPtr tokenizerPtr);
/**
 * Free a native tokenization result. The pointer must not be used after this call.
 */
static native void dropTokenization(Pointer tokenization);
/**
 * Tokenize {@code text} using the given native tokenizer.
 * The result owns native memory and must be closed (hence {@link MustBeClosed}) —
 * presumably via {@link #dropTokenization}; confirm against
 * {@code NativeTokenization}'s close implementation.
 */
@MustBeClosed
static native NativeTokenization tokenize(TokenizerPtr ptr, String text);
}
// Create a TokenizersFastPretrained Java class, bind methods to calls
// Return a result return type.
// Dynamically allocate some memory, push it back to the user, whatever.
use std::{
ffi::{c_char, CStr},
str::FromStr,
};
use tokenizers::Tokenizer;
/// FFI result of `tokenize`: the token ids, attention mask, and type ids for
/// one encoded input. Returned to the JVM as an owned pointer and released
/// via `dropTokenization`.
#[repr(C)]
pub struct Tokenization {
tokens: Box<TensorWrapper>,
attns: Box<TensorWrapper>,
type_ids: Box<TensorWrapper>,
}
/// A heap-allocated `i64` buffer plus its element count, handed across FFI.
///
/// NOTE(review): `Box<[i64]>` is a *fat* pointer (data pointer + slice length),
/// so despite `#[repr(C)]` the first field occupies two words, not one — the
/// foreign reader only sees the data pointer at offset 0 by layout accident.
/// The `improper_ctypes` lint would flag this; confirm the Java-side struct
/// mapping before relying on field offsets.
#[repr(C)]
pub struct TensorWrapper {
ptr: Box<[i64]>,
len: i32,
}
// Uncomment for debugging memory leaks
// impl Drop for Tokenization {
// fn drop(&mut self) {
// println!("Dropping tokenization");
// }
// }
//
// impl Drop for TensorWrapper {
// fn drop(&mut self) {
// println!("Dropping TensorWrapper");
// }
// }
/// Build a tokenizer by loading a HuggingFace `tokenizers.json` file from disk.
///
/// NOTE(review): `unwrap` panics when the file is missing or invalid; a panic
/// escaping an `extern "C"` fn is not something the JVM caller can recover from.
#[no_mangle]
pub extern "C" fn newTokenizerFromFile(path: *const c_char) -> Box<Tokenizer> {
    let tokenizer = Tokenizer::from_file(to_string(path)).unwrap();
    Box::new(tokenizer)
}
/// Build a tokenizer from a serialized `tokenizers.json` payload held in memory.
///
/// NOTE(review): `unwrap` panics on malformed JSON; a panic escaping an
/// `extern "C"` fn is not something the JVM caller can recover from.
#[no_mangle]
pub extern "C" fn newTokenizerFromString(text: *const c_char) -> Box<Tokenizer> {
    let json = to_string(text);
    Box::new(Tokenizer::from_str(&json).unwrap())
}
/// Tokenize the given input string using the passed tokenizer
/// Return the given array as an approach here instead.
#[no_mangle]
pub extern "C" fn tokenize(tokenizer_ptr: &Tokenizer, text: *const c_char) -> Box<Tokenization> {
let text_str = to_string(text);
let encoding = tokenizer_ptr
.encode(text_str, false)
.expect("Rust Tokenizer failed");
let token_ids = encoding
.get_ids()
.iter()
.map(|f| *f as i64)
.collect::<Vec<i64>>()
.into_boxed_slice();
let attention_mask = encoding
.get_attention_mask()
.iter()
.map(|f| *f as i64)
.collect::<Vec<i64>>()
.into_boxed_slice();
let type_ids = encoding
.get_type_ids()
.iter()
.map(|f| *f as i64)
.collect::<Vec<i64>>()
.into_boxed_slice();
Box::new(Tokenization {
tokens: Box::new(TensorWrapper {
len: token_ids.len() as i32,
ptr: token_ids,
}),
attns: Box::new(TensorWrapper {
len: attention_mask.len() as i32,
ptr: attention_mask,
}),
type_ids: Box::new(TensorWrapper {
len: type_ids.len() as i32,
ptr: type_ids,
}),
})
}
/// Free a tokenizer previously returned by `newTokenizerFromFile` or
/// `newTokenizerFromString`. The pointer must not be used after this call.
#[no_mangle]
pub extern "C" fn dropTokenizer(_: Box<Tokenizer>) {
// Do nothing here. Because we own the parameter, it will go out of scope and the memory
// will be freed.
}
/// Free a `Tokenization` previously returned by `tokenize`.
/// The pointer must not be used after this call.
#[no_mangle]
pub extern "C" fn dropTokenization(_: Box<Tokenization>) {
// Do nothing, the passed value and all the internal Box<TensorWrapper> will be deinited on exit.
}
/// Copy a NUL-terminated C string received over FFI into an owned `String`.
///
/// NOTE(review): `pointer` must be non-null and point to a valid,
/// NUL-terminated string — a null pointer here is undefined behavior.
///
/// # Panics
/// Panics if the bytes are not valid UTF-8.
fn to_string(pointer: *const c_char) -> String {
    // SAFETY: callers (the JNA bindings) pass a valid, NUL-terminated C string.
    let c_str = unsafe { CStr::from_ptr(pointer) };
    // Validate UTF-8 before allocating; the original copied the bytes into a
    // Vec first and only then validated, wasting the allocation on failure.
    c_str
        .to_str()
        .expect("C string passed over FFI was not valid UTF-8")
        .to_owned()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment