mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-06 04:38:23 +00:00
Char based delimiter splitting - TransfoXL (#114)
* WIP delimiter splitter Signed-off-by: Morgan Funtowicz <morgan@huggingface.co> * Bind on Python side. Signed-off-by: Morgan Funtowicz <morgan@huggingface.co> * Add missing delimiter parameter in CharDelimiterSplit constructor. Signed-off-by: Morgan Funtowicz <morgan@huggingface.co> * Attempt to provide CharDelimiterSplit for node. Signed-off-by: Morgan Funtowicz <morgan@huggingface.co> * Apply Rust formatting. Signed-off-by: Morgan Funtowicz <morgan@huggingface.co> * fix bindings node Co-authored-by: Pierric Cistac <Pierrci@users.noreply.github.com>
This commit is contained in:
11
bindings/node/lib/bindings/pre-tokenizers.d.ts
vendored
11
bindings/node/lib/bindings/pre-tokenizers.d.ts
vendored
@@ -45,7 +45,7 @@ export function whitespaceSplitPreTokenizer(): PreTokenizer;
|
|||||||
export function bertPreTokenizer(): PreTokenizer;
|
export function bertPreTokenizer(): PreTokenizer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a new Metaspace Tokenizer.
|
* Returns a new Metaspace PreTokenizer.
|
||||||
* This pre-tokenizer replaces any whitespace by the provided replacement character.
|
* This pre-tokenizer replaces any whitespace by the provided replacement character.
|
||||||
* It then tries to split on these spaces.
|
* It then tries to split on these spaces.
|
||||||
*
|
*
|
||||||
@@ -58,3 +58,12 @@ export function metaspacePreTokenizer(
|
|||||||
replacement?: string,
|
replacement?: string,
|
||||||
addPrefixSpace?: boolean
|
addPrefixSpace?: boolean
|
||||||
): PreTokenizer;
|
): PreTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a CharDelimiterSplit PreTokenizer
|
||||||
|
* This pre-tokenizer simply splits on the provided delimiter. Works almost like the `.split(delimiter)`
|
||||||
|
* function, except that consecutive delimiters do not produce empty pieces
|
||||||
|
*
|
||||||
|
* @param delimiter The delimiter character on which the sequence will be split.
|
||||||
|
*/
|
||||||
|
export function charDelimiterSplitPreTokenizer(delimiter: string): PreTokenizer;
|
||||||
|
|||||||
@@ -6,5 +6,6 @@ module.exports = {
|
|||||||
whitespacePreTokenizer: native.pre_tokenizers_Whitespace,
|
whitespacePreTokenizer: native.pre_tokenizers_Whitespace,
|
||||||
whitespaceSplitPreTokenizer: native.pre_tokenizers_WhitespaceSplit,
|
whitespaceSplitPreTokenizer: native.pre_tokenizers_WhitespaceSplit,
|
||||||
bertPreTokenizer: native.pre_tokenizers_BertPreTokenizer,
|
bertPreTokenizer: native.pre_tokenizers_BertPreTokenizer,
|
||||||
metaspacePreTokenizer: native.pre_tokenizers_Metaspace
|
metaspacePreTokenizer: native.pre_tokenizers_Metaspace,
|
||||||
|
charDelimiterSplitPreTokenizer: native.pre_tokenizers_CharDelimiterSplit
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -108,6 +108,23 @@ fn metaspace(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
|
|||||||
Ok(pretok)
|
Ok(pretok)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// char_delimiter_split(delimiter: string)
|
||||||
|
fn char_delimiter_split(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
|
||||||
|
let argument = cx.argument::<JsString>(0)?.value();
|
||||||
|
let delimiter = argument.chars().nth(0).ok_or_else(|| {
|
||||||
|
cx.throw_error::<_, ()>("delimiter must be a character")
|
||||||
|
.unwrap_err()
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
|
||||||
|
let guard = cx.lock();
|
||||||
|
pretok.borrow_mut(&guard).pretok.to_owned(Box::new(
|
||||||
|
tk::pre_tokenizers::delimiter::CharDelimiterSplit::new(delimiter),
|
||||||
|
));
|
||||||
|
|
||||||
|
Ok(pretok)
|
||||||
|
}
|
||||||
|
|
||||||
/// Register everything here
|
/// Register everything here
|
||||||
pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
|
pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
|
||||||
m.export_function(&format!("{}_ByteLevel", prefix), byte_level)?;
|
m.export_function(&format!("{}_ByteLevel", prefix), byte_level)?;
|
||||||
@@ -119,5 +136,9 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
|
|||||||
m.export_function(&format!("{}_WhitespaceSplit", prefix), whitespace_split)?;
|
m.export_function(&format!("{}_WhitespaceSplit", prefix), whitespace_split)?;
|
||||||
m.export_function(&format!("{}_BertPreTokenizer", prefix), bert_pre_tokenizer)?;
|
m.export_function(&format!("{}_BertPreTokenizer", prefix), bert_pre_tokenizer)?;
|
||||||
m.export_function(&format!("{}_Metaspace", prefix), metaspace)?;
|
m.export_function(&format!("{}_Metaspace", prefix), metaspace)?;
|
||||||
|
m.export_function(
|
||||||
|
&format!("{}_CharDelimiterSplit", prefix),
|
||||||
|
char_delimiter_split,
|
||||||
|
)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
|||||||
m.add_class::<pre_tokenizers::WhitespaceSplit>()?;
|
m.add_class::<pre_tokenizers::WhitespaceSplit>()?;
|
||||||
m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
|
m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
|
||||||
m.add_class::<pre_tokenizers::Metaspace>()?;
|
m.add_class::<pre_tokenizers::Metaspace>()?;
|
||||||
|
m.add_class::<pre_tokenizers::CharDelimiterSplit>()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -85,6 +85,23 @@ impl WhitespaceSplit {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[pyclass]
|
||||||
|
pub struct CharDelimiterSplit {}
|
||||||
|
#[pymethods]
|
||||||
|
impl CharDelimiterSplit {
|
||||||
|
#[staticmethod]
|
||||||
|
pub fn new(delimiter: &str) -> PyResult<PreTokenizer> {
|
||||||
|
let chr_delimiter = delimiter.chars().nth(0).ok_or(exceptions::Exception::py_err(
|
||||||
|
"delimiter must be a single character",
|
||||||
|
))?;
|
||||||
|
Ok(PreTokenizer{
|
||||||
|
pretok:Container::Owned(Box::new(
|
||||||
|
tk::pre_tokenizers::delimiter::CharDelimiterSplit::new(chr_delimiter)
|
||||||
|
))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[pyclass]
|
#[pyclass]
|
||||||
pub struct BertPreTokenizer {}
|
pub struct BertPreTokenizer {}
|
||||||
#[pymethods]
|
#[pymethods]
|
||||||
|
|||||||
@@ -6,3 +6,4 @@ Whitespace = pre_tokenizers.Whitespace
|
|||||||
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
|
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
|
||||||
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
|
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
|
||||||
Metaspace = pre_tokenizers.Metaspace
|
Metaspace = pre_tokenizers.Metaspace
|
||||||
|
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ class PreTokenizer:
|
|||||||
""" Pre tokenize the given sequence """
|
""" Pre tokenize the given sequence """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class ByteLevel:
|
class ByteLevel(PreTokenizer):
|
||||||
""" ByteLevel PreTokenizer
|
""" ByteLevel PreTokenizer
|
||||||
|
|
||||||
This pre-tokenizer takes care of replacing all bytes of the given string
|
This pre-tokenizer takes care of replacing all bytes of the given string
|
||||||
@@ -44,7 +44,7 @@ class ByteLevel:
|
|||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class Whitespace:
|
class Whitespace(PreTokenizer):
|
||||||
""" Whitespace PreTokenizer
|
""" Whitespace PreTokenizer
|
||||||
|
|
||||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||||
@@ -55,7 +55,7 @@ class Whitespace:
|
|||||||
""" Instantiate a new Whitespace PreTokenizer """
|
""" Instantiate a new Whitespace PreTokenizer """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class WhitespaceSplit:
|
class WhitespaceSplit(PreTokenizer):
|
||||||
""" Whitespace PreTokenizer
|
""" Whitespace PreTokenizer
|
||||||
|
|
||||||
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
|
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
|
||||||
@@ -66,7 +66,7 @@ class WhitespaceSplit:
|
|||||||
""" Instantiate a new WhitespaceSplit PreTokenizer """
|
""" Instantiate a new WhitespaceSplit PreTokenizer """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class BertPreTokenizer:
|
class BertPreTokenizer(PreTokenizer):
|
||||||
""" BertPreTokenizer
|
""" BertPreTokenizer
|
||||||
|
|
||||||
This pre-tokenizer splits tokens on spaces, and also on punctuation.
|
This pre-tokenizer splits tokens on spaces, and also on punctuation.
|
||||||
@@ -78,7 +78,7 @@ class BertPreTokenizer:
|
|||||||
""" Instantiate a new BertPreTokenizer """
|
""" Instantiate a new BertPreTokenizer """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class Metaspace:
|
class Metaspace(PreTokenizer):
|
||||||
""" Metaspace pre-tokenizer
|
""" Metaspace pre-tokenizer
|
||||||
|
|
||||||
This pre-tokenizer replaces any whitespace by the provided replacement character.
|
This pre-tokenizer replaces any whitespace by the provided replacement character.
|
||||||
@@ -100,3 +100,20 @@ class Metaspace:
|
|||||||
lets us treat `hello` exactly like `say hello`.
|
lets us treat `hello` exactly like `say hello`.
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class CharDelimiterSplit(PreTokenizer):
|
||||||
|
""" CharDelimiterSplit PreTokenizer
|
||||||
|
|
||||||
|
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`, except that empty pieces (from leading, trailing or consecutive delimiters) are dropped
|
||||||
|
"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def new(delimiter: str) -> PreTokenizer:
|
||||||
|
""" Instantiate a new CharDelimiterSplit PreTokenizer
|
||||||
|
|
||||||
|
Args:
|
||||||
|
delimiter: str:
|
||||||
|
The delimiter char that will be used to split input
|
||||||
|
"""
|
||||||
|
pass
|
||||||
37
tokenizers/src/pre_tokenizers/delimiter.rs
Normal file
37
tokenizers/src/pre_tokenizers/delimiter.rs
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
use crate::tokenizer::{Offsets, PreTokenizer, Result};
|
||||||
|
|
||||||
|
pub struct CharDelimiterSplit {
|
||||||
|
delimiter: char,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CharDelimiterSplit {
|
||||||
|
pub fn new(delimiter: char) -> Self {
|
||||||
|
CharDelimiterSplit { delimiter }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PreTokenizer for CharDelimiterSplit {
|
||||||
|
fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
|
||||||
|
let mut words = vec![];
|
||||||
|
let mut word = Vec::with_capacity(1000);
|
||||||
|
let mut offset = 0;
|
||||||
|
|
||||||
|
s.chars().for_each(|c| {
|
||||||
|
if c == self.delimiter {
|
||||||
|
if !word.is_empty() {
|
||||||
|
let offsets = (offset - word.len(), offset);
|
||||||
|
words.push((word.drain(0..).collect::<String>(), offsets));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
word.push(c);
|
||||||
|
}
|
||||||
|
offset += 1;
|
||||||
|
});
|
||||||
|
if !word.is_empty() {
|
||||||
|
let offsets = (offset - word.len(), offset);
|
||||||
|
words.push((word.drain(0..).collect::<String>(), offsets));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(words)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
pub mod bert;
|
pub mod bert;
|
||||||
pub mod byte_level;
|
pub mod byte_level;
|
||||||
|
pub mod delimiter;
|
||||||
pub mod metaspace;
|
pub mod metaspace;
|
||||||
pub mod whitespace;
|
pub mod whitespace;
|
||||||
|
|||||||
Reference in New Issue
Block a user