mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-06 20:58:22 +00:00
Char based delimiter splitting - TransfoXL (#114)
* WIP delimiter splitter Signed-off-by: Morgan Funtowicz <morgan@huggingface.co> * Bind on Python side. Signed-off-by: Morgan Funtowicz <morgan@huggingface.co> * Add missing delimiter parameter in CharDelimiterSplit constructor. Signed-off-by: Morgan Funtowicz <morgan@huggingface.co> * Attempt to provide CharDelimiterSplit for node. Signed-off-by: Morgan Funtowicz <morgan@huggingface.co> * Apply Rust formatting. Signed-off-by: Morgan Funtowicz <morgan@huggingface.co> * fix bindings node Co-authored-by: Pierric Cistac <Pierrci@users.noreply.github.com>
This commit is contained in:
@@ -40,6 +40,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<pre_tokenizers::WhitespaceSplit>()?;
|
||||
m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
|
||||
m.add_class::<pre_tokenizers::Metaspace>()?;
|
||||
m.add_class::<pre_tokenizers::CharDelimiterSplit>()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -85,6 +85,23 @@ impl WhitespaceSplit {
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct CharDelimiterSplit {}
|
||||
#[pymethods]
|
||||
impl CharDelimiterSplit {
|
||||
#[staticmethod]
|
||||
pub fn new(delimiter: &str) -> PyResult<PreTokenizer> {
|
||||
let chr_delimiter = delimiter.chars().nth(0).ok_or(exceptions::Exception::py_err(
|
||||
"delimiter must be a single character",
|
||||
))?;
|
||||
Ok(PreTokenizer{
|
||||
pretok:Container::Owned(Box::new(
|
||||
tk::pre_tokenizers::delimiter::CharDelimiterSplit::new(chr_delimiter)
|
||||
))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct BertPreTokenizer {}
|
||||
#[pymethods]
|
||||
|
||||
@@ -6,3 +6,4 @@ Whitespace = pre_tokenizers.Whitespace
|
||||
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
|
||||
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
|
||||
Metaspace = pre_tokenizers.Metaspace
|
||||
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
|
||||
|
||||
@@ -13,7 +13,7 @@ class PreTokenizer:
|
||||
""" Pre tokenize the given sequence """
|
||||
pass
|
||||
|
||||
class ByteLevel:
|
||||
class ByteLevel(PreTokenizer):
|
||||
""" ByteLevel PreTokenizer
|
||||
|
||||
This pre-tokenizer takes care of replacing all bytes of the given string
|
||||
@@ -44,7 +44,7 @@ class ByteLevel:
|
||||
"""
|
||||
pass
|
||||
|
||||
class Whitespace:
|
||||
class Whitespace(PreTokenizer):
|
||||
""" Whitespace PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||
@@ -55,7 +55,7 @@ class Whitespace:
|
||||
""" Instantiate a new Whitespace PreTokenizer """
|
||||
pass
|
||||
|
||||
class WhitespaceSplit:
|
||||
class WhitespaceSplit(PreTokenizer):
|
||||
""" Whitespace PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
|
||||
@@ -66,7 +66,7 @@ class WhitespaceSplit:
|
||||
""" Instantiate a new WhitespaceSplit PreTokenizer """
|
||||
pass
|
||||
|
||||
class BertPreTokenizer:
|
||||
class BertPreTokenizer(PreTokenizer):
|
||||
""" BertPreTokenizer
|
||||
|
||||
This pre-tokenizer splits tokens on spaces, and also on punctuation.
|
||||
@@ -78,7 +78,7 @@ class BertPreTokenizer:
|
||||
""" Instantiate a new BertPreTokenizer """
|
||||
pass
|
||||
|
||||
class Metaspace:
|
||||
class Metaspace(PreTokenizer):
|
||||
""" Metaspace pre-tokenizer
|
||||
|
||||
This pre-tokenizer replaces any whitespace by the provided replacement character.
|
||||
@@ -100,3 +100,20 @@ class Metaspace:
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class CharDelimiterSplit(PreTokenizer):
    """ CharDelimiterSplit PreTokenizer

    Splits the input on every occurrence of one delimiter character,
    in the same spirit as `.split(delimiter)`.
    """

    @staticmethod
    def new(delimiter: str) -> PreTokenizer:
        """ Build a new CharDelimiterSplit PreTokenizer

        Args:
            delimiter: str:
                The character used to split the input. Expected to be a
                single character; an empty string raises an exception.

        Returns:
            PreTokenizer
        """
        pass
|
||||
Reference in New Issue
Block a user