Created
February 13, 2025 21:43
-
-
Save a10y/ced7051d73c5a2dead1efcbe5639ccbb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
plugins {
    `java-library`
    `maven-publish`
    `signing`
}

// Consumer-only configuration that receives the Rust-built shared library
// artifact from the :tokenizers-rs project.
val tokenizerSharedLibrary by configurations.creating {
    isCanBeConsumed = false
}

dependencies {
    tokenizerSharedLibrary(project(path = ":tokenizers-rs", configuration = "tokenizerSharedLibrary"))
    api("net.java.dev.jna:jna-platform")
    implementation("com.google.guava:guava")
    implementation("org.slf4j:slf4j-api")
    testImplementation(platform("org.junit:junit-bom"))
    testImplementation("org.junit.jupiter:junit-jupiter-engine")
    testImplementation("org.assertj:assertj-core")
}

// Normalize the JVM's arch name to JNA's resource-directory convention
// ("amd64"/"x86_64" both map to "x86-64"; everything else passes through).
val osName = System.getProperty("os.name")
val osArch = when (val arch = System.getProperty("os.arch")) {
    "amd64", "x86_64" -> "x86-64"
    else -> arch
}
println("System: $osName $osArch")

// JNA discovers bundled native libraries under <os>-<arch> resource dirs.
// NOTE(review): anything that is not macOS is assumed to be Linux here —
// Windows would silently get a "linux-*" directory; confirm that is intended.
val resourceDir = when {
    osName.startsWith("Mac") -> "darwin-$osArch"
    else -> "linux-$osArch"
}
println("resourceDirname = $resourceDir")

// Stage the Rust shared library into resources so it ships inside the jar.
val copySharedLibrary by tasks.registering(Copy::class) {
    from(tokenizerSharedLibrary)
    into(projectDir.resolve("src/main/resources/$resourceDir"))
    doLast {
        println("Copied ${tokenizerSharedLibrary.files.size} files into resource directory")
    }
}

tasks.withType<ProcessResources>().configureEach {
    dependsOn(copySharedLibrary)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* (c) Copyright 2023 Intrinsic Labs, LLC. All rights reserved. | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); | |
* you may not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
package ai.intrinsiclabs.tokenizers.jna; | |
import com.google.errorprone.annotations.MustBeClosed; | |
import com.sun.jna.Native; | |
import com.sun.jna.Pointer; | |
/**
 * Java Native Access Direct Mapping class holding methods that map to native calls to our {@code jtokenizers} Rust
 * library.
 */
public final class TokenizerLibrary {
    static {
        // Direct mapping: binds every native method declared below to the
        // "jtokenizers" shared library at class-load time. Throws
        // UnsatisfiedLinkError if the library cannot be located on the
        // JNA library path / bundled resources.
        Native.register("jtokenizers");
    }

    /**
     * Create a tokenizer from a {@code "tokenizers.json"} file.
     */
    public static native TokenizerPtr newTokenizerFromFile(String path);

    /** Create a tokenizer from the contents of a {@code tokenizers.json} file held in memory as a string. */
    static native TokenizerPtr newTokenizerFromString(String json);

    /** Release a native tokenizer. Must be called exactly once per {@link TokenizerPtr}; double-free is undefined. */
    static native void dropTokenizer(TokenizerPtr tokenizerPtr);

    /** Release a native tokenization result previously returned by {@link #tokenize}. */
    static native void dropTokenization(Pointer tokenization);

    /**
     * Tokenize {@code text} with the tokenizer behind {@code ptr}. The result owns native
     * memory and must be closed by the caller (hence {@code @MustBeClosed}), which should
     * ultimately call {@link #dropTokenization}.
     */
    @MustBeClosed
    static native NativeTokenization tokenize(TokenizerPtr ptr, String text);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Create a TokenizersFastPretrained Java class, bind methods to calls | |
// Return a result return type. | |
// Dynamically allocate some memory, push it back to the user, whatever. | |
use std::{ | |
ffi::{c_char, CStr}, | |
str::FromStr, | |
}; | |
use tokenizers::Tokenizer; | |
/// Result of one `tokenize` call, returned to Java by pointer over JNA.
///
/// NOTE(review): `#[repr(C)]` with `Box<TensorWrapper>` fields relies on
/// `Box<T>` (T: Sized) having thin-pointer layout. That works in practice,
/// but the Java-side `NativeTokenization` struct mapping must match this
/// exact three-pointer layout — verify against the JNA class.
#[repr(C)]
pub struct Tokenization {
    // Token ids widened to i64.
    tokens: Box<TensorWrapper>,
    // Attention mask (0/1) widened to i64.
    attns: Box<TensorWrapper>,
    // Token type ids widened to i64.
    type_ids: Box<TensorWrapper>,
}
/// A length-tagged i64 buffer handed across the FFI boundary.
///
/// NOTE(review): `Box<[i64]>` is a *fat* pointer (data pointer + usize
/// length), so the C-visible layout here is {ptr, usize_len, i32 len} —
/// not the conventional {ptr, len}. The redundant `len: i32` suggests the
/// Java side reads that field; confirm the JNA mapping accounts for the
/// hidden usize, and consider `*mut i64` + len as the idiomatic FFI shape.
#[repr(C)]
pub struct TensorWrapper {
    // Owned slice; freed when the enclosing Tokenization is dropped.
    ptr: Box<[i64]>,
    // Element count, duplicated as i32 for the Java side.
    len: i32,
}
// Uncomment for debugging memory leaks | |
// impl Drop for Tokenization { | |
// fn drop(&mut self) { | |
// println!("Dropping tokenization"); | |
// } | |
// } | |
// | |
// impl Drop for TensorWrapper { | |
// fn drop(&mut self) { | |
// println!("Dropping TensorWrapper"); | |
// } | |
// } | |
/// Create a new tokenizer, loading a HuggingFace Tokenizer instance | |
/// from a `tokenizers.json` file. | |
#[no_mangle] | |
pub extern "C" fn newTokenizerFromFile(path: *const c_char) -> Box<Tokenizer> { | |
let file_path = to_string(path); | |
let tokenizer = Tokenizer::from_file(&file_path).unwrap(); | |
return Box::new(tokenizer); | |
} | |
/// Create a new tokenizer, loading a HuggingFace Tokenizer instance | |
/// from a `tokenizers.json` file serialized as an in-memory string. | |
#[no_mangle] | |
pub extern "C" fn newTokenizerFromString(text: *const c_char) -> Box<Tokenizer> { | |
let text_str = to_string(text); | |
let tokenizer = Tokenizer::from_str(&text_str).unwrap(); | |
return Box::new(tokenizer); | |
} | |
/// Tokenize the given input string using the passed tokenizer | |
/// Return the given array as an approach here instead. | |
#[no_mangle] | |
pub extern "C" fn tokenize(tokenizer_ptr: &Tokenizer, text: *const c_char) -> Box<Tokenization> { | |
let text_str = to_string(text); | |
let encoding = tokenizer_ptr | |
.encode(text_str, false) | |
.expect("Rust Tokenizer failed"); | |
let token_ids = encoding | |
.get_ids() | |
.iter() | |
.map(|f| *f as i64) | |
.collect::<Vec<i64>>() | |
.into_boxed_slice(); | |
let attention_mask = encoding | |
.get_attention_mask() | |
.iter() | |
.map(|f| *f as i64) | |
.collect::<Vec<i64>>() | |
.into_boxed_slice(); | |
let type_ids = encoding | |
.get_type_ids() | |
.iter() | |
.map(|f| *f as i64) | |
.collect::<Vec<i64>>() | |
.into_boxed_slice(); | |
Box::new(Tokenization { | |
tokens: Box::new(TensorWrapper { | |
len: token_ids.len() as i32, | |
ptr: token_ids, | |
}), | |
attns: Box::new(TensorWrapper { | |
len: attention_mask.len() as i32, | |
ptr: attention_mask, | |
}), | |
type_ids: Box::new(TensorWrapper { | |
len: type_ids.len() as i32, | |
ptr: type_ids, | |
}), | |
}) | |
} | |
/// Free a tokenizer previously returned by `newTokenizerFromFile` or
/// `newTokenizerFromString`.
///
/// Taking the `Box` by value transfers ownership back to Rust, so the
/// tokenizer is dropped (and its memory freed) when this function returns.
/// NOTE(review): passing null or an already-freed pointer from Java is
/// undefined behavior — the JNA wrapper must guarantee exactly-once drops.
#[no_mangle]
pub extern "C" fn dropTokenizer(_: Box<Tokenizer>) {
    // Do nothing here. Because we own the parameter, it will go out of scope and the memory
    // will be freed.
}
/// Free a tokenization result previously returned by `tokenize`.
///
/// Ownership of the `Box` returns to Rust here, so the `Tokenization` and
/// every nested `Box<TensorWrapper>`/`Box<[i64]>` it owns are freed on exit.
/// NOTE(review): must be called exactly once per pointer; null or double
/// frees from the Java side are undefined behavior.
#[no_mangle]
pub extern "C" fn dropTokenization(_: Box<Tokenization>) {
    // Do nothing, the passed value and all the internal Box<TensorWrapper> will be deinited on exit.
}
/// Convert a borrowed, NUL-terminated C string into an owned Rust `String`.
///
/// Validates UTF-8 in place via `CStr::to_str` and then allocates once,
/// instead of copying the bytes into a `Vec` *before* validating as the
/// previous `String::from_utf8(slice.into())` did.
///
/// # Panics
/// Panics if the bytes are not valid UTF-8.
///
/// The pointer must be non-null and point to a valid NUL-terminated buffer
/// that outlives this call (upheld by the JNA binding layer).
fn to_string(pointer: *const c_char) -> String {
    // SAFETY: callers pass a valid, NUL-terminated string owned by the JVM
    // for the duration of the native call; we only borrow it here.
    let c_str = unsafe { CStr::from_ptr(pointer) };
    c_str
        .to_str()
        .expect("C string was not valid UTF-8")
        .to_owned()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment