mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-06 12:48:18 +00:00
Adding 3 new PreTokenizers:
- Deduplication: removes duplicate spaces within strings
- Punctuation: splits punctuation characters off as isolated tokens
- Sequence: applies a list of pre-tokenizers iteratively
This commit is contained in:
committed by
Anthony MOI
parent
50ac90d338
commit
7ed7f0f26a
@@ -64,6 +64,9 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<pre_tokenizers::PyBertPreTokenizer>()?;
     m.add_class::<pre_tokenizers::PyMetaspace>()?;
     m.add_class::<pre_tokenizers::PyCharDelimiterSplit>()?;
+    m.add_class::<pre_tokenizers::PyDeduplication>()?;
+    m.add_class::<pre_tokenizers::PyPunctuation>()?;
+    m.add_class::<pre_tokenizers::PySequence>()?;
     Ok(())
 }
Reference in New Issue
Block a user