Adding 3 new PreTokenizers:

- Deduplication: Removes duplicate spaces within strings
- Punctuation: Splits punctuation characters as isolated tokens
- Sequence: Applies a list of pretokenizers iteratively
This commit is contained in:
Nicolas Patry
2020-08-21 16:37:38 +02:00
committed by Anthony MOI
parent 50ac90d338
commit 7ed7f0f26a
9 changed files with 341 additions and 4 deletions

View File

@@ -64,6 +64,9 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<pre_tokenizers::PyBertPreTokenizer>()?;
m.add_class::<pre_tokenizers::PyMetaspace>()?;
m.add_class::<pre_tokenizers::PyCharDelimiterSplit>()?;
m.add_class::<pre_tokenizers::PyDeduplication>()?;
m.add_class::<pre_tokenizers::PyPunctuation>()?;
m.add_class::<pre_tokenizers::PySequence>()?;
Ok(())
}