Adding 3 new PreTokenizers:

- Deduplication: Removes duplicate spaces within strings
- Punctuation: Splits punctuation characters as isolated tokens
- Sequence: Applies a list of pretokenizers iteratively
2025-12-06 12:48:18 +00:00 · 2020-08-21 16:37:38 +02:00
parent 50ac90d338
commit 7ed7f0f26a
9 changed files with 341 additions and 4 deletions
--- a/bindings/python/src/lib.rs
+++ b/bindings/python/src/lib.rs
@@ -64,6 +64,9 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<pre_tokenizers::PyBertPreTokenizer>()?;
    m.add_class::<pre_tokenizers::PyMetaspace>()?;
    m.add_class::<pre_tokenizers::PyCharDelimiterSplit>()?;
+    m.add_class::<pre_tokenizers::PyDeduplication>()?;
+    m.add_class::<pre_tokenizers::PyPunctuation>()?;
+    m.add_class::<pre_tokenizers::PySequence>()?;
    Ok(())
 }