mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Node - Add pre tokenizers
This commit is contained in:
@ -5,6 +5,7 @@ mod decoders;
|
||||
mod encoding;
|
||||
mod models;
|
||||
mod normalizers;
|
||||
mod pre_tokenizers;
|
||||
mod processors;
|
||||
mod tasks;
|
||||
mod tokenizer;
|
||||
@ -23,6 +24,8 @@ register_module!(mut m, {
|
||||
processors::register(&mut m, "processors")?;
|
||||
// Normalizers
|
||||
normalizers::register(&mut m, "normalizers")?;
|
||||
// PreTokenizers
|
||||
pre_tokenizers::register(&mut m, "pre_tokenizers")?;
|
||||
|
||||
Ok(())
|
||||
});
|
||||
|
111
bindings/node/native/src/pre_tokenizers.rs
Normal file
111
bindings/node/native/src/pre_tokenizers.rs
Normal file
@ -0,0 +1,111 @@
|
||||
extern crate tokenizers as tk;
|
||||
|
||||
use crate::utils::Container;
|
||||
use neon::prelude::*;
|
||||
|
||||
/// PreTokenizers
|
||||
pub struct PreTokenizer {
|
||||
pub pretok: Container<dyn tk::tokenizer::PreTokenizer + Sync>,
|
||||
}
|
||||
|
||||
declare_types! {
    pub class JsPreTokenizer for PreTokenizer {
        init(_) {
            // Not meant to be invoked directly from JS: the instance starts
            // with an empty container that the factory functions in this
            // module fill in right after construction.
            let empty = PreTokenizer {
                pretok: Container::Empty,
            };
            Ok(empty)
        }
    }
}
|
||||
|
||||
/// byte_level(addPrefixSpace: bool = true)
|
||||
fn byte_level(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
|
||||
let mut add_prefix_space = true;
|
||||
|
||||
if let Some(args) = cx.argument_opt(0) {
|
||||
add_prefix_space = args.downcast::<JsBoolean>().or_throw(&mut cx)?.value();
|
||||
}
|
||||
|
||||
let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
|
||||
let guard = cx.lock();
|
||||
pretok.borrow_mut(&guard).pretok.to_owned(Box::new(
|
||||
tk::pre_tokenizers::byte_level::ByteLevel::new(add_prefix_space),
|
||||
));
|
||||
Ok(pretok)
|
||||
}
|
||||
|
||||
/// byte_level_alphabet()
|
||||
fn byte_level_alphabet(mut cx: FunctionContext) -> JsResult<JsArray> {
|
||||
let chars = tk::pre_tokenizers::byte_level::ByteLevel::alphabet()
|
||||
.into_iter()
|
||||
.map(|c| c.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let js_chars = JsArray::new(&mut cx, chars.len() as u32);
|
||||
for (i, c) in chars.into_iter().enumerate() {
|
||||
let s = cx.string(c);
|
||||
js_chars.set(&mut cx, i as u32, s)?;
|
||||
}
|
||||
|
||||
Ok(js_chars)
|
||||
}
|
||||
|
||||
/// whitespace()
|
||||
fn whitespace(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
|
||||
let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
|
||||
let guard = cx.lock();
|
||||
pretok
|
||||
.borrow_mut(&guard)
|
||||
.pretok
|
||||
.to_owned(Box::new(tk::pre_tokenizers::whitespace::Whitespace));
|
||||
Ok(pretok)
|
||||
}
|
||||
|
||||
/// bert_pre_tokenizer()
|
||||
fn bert_pre_tokenizer(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
|
||||
let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
|
||||
let guard = cx.lock();
|
||||
pretok
|
||||
.borrow_mut(&guard)
|
||||
.pretok
|
||||
.to_owned(Box::new(tk::pre_tokenizers::bert::BertPreTokenizer));
|
||||
Ok(pretok)
|
||||
}
|
||||
|
||||
/// metaspace(replacement: string = '_', addPrefixSpace: bool = true)
|
||||
fn metaspace(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
|
||||
let mut replacement = '▁';
|
||||
if let Some(args) = cx.argument_opt(0) {
|
||||
let rep = args.downcast::<JsString>().or_throw(&mut cx)?.value() as String;
|
||||
replacement = rep.chars().nth(0).ok_or_else(|| {
|
||||
cx.throw_error::<_, ()>("replacement must be a character")
|
||||
.unwrap_err()
|
||||
})?;
|
||||
};
|
||||
|
||||
let mut add_prefix_space = true;
|
||||
if let Some(args) = cx.argument_opt(1) {
|
||||
add_prefix_space = args.downcast::<JsBoolean>().or_throw(&mut cx)?.value() as bool;
|
||||
}
|
||||
|
||||
let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
|
||||
let guard = cx.lock();
|
||||
pretok.borrow_mut(&guard).pretok.to_owned(Box::new(
|
||||
tk::pre_tokenizers::metaspace::Metaspace::new(replacement, add_prefix_space),
|
||||
));
|
||||
Ok(pretok)
|
||||
}
|
||||
|
||||
/// Register everything here
|
||||
pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
|
||||
m.export_function(&format!("{}_ByteLevel", prefix), byte_level)?;
|
||||
m.export_function(
|
||||
&format!("{}_ByteLevel_Alphabet", prefix),
|
||||
byte_level_alphabet,
|
||||
)?;
|
||||
m.export_function(&format!("{}_Whitespace", prefix), whitespace)?;
|
||||
m.export_function(&format!("{}_BertPreTokenizer", prefix), bert_pre_tokenizer)?;
|
||||
m.export_function(&format!("{}_Metaspace", prefix), metaspace)?;
|
||||
Ok(())
|
||||
}
|
Reference in New Issue
Block a user