Adding 3 new PreTokenizers:

- Deduplication: Removes duplicate spaces within strings
- Punctuation: Splits punctuation characters as isolated tokens
- Sequence: Applies a list of pretokenizers iteratively
This commit is contained in:
Nicolas Patry
2020-08-21 16:37:38 +02:00
committed by Anthony MOI
parent 50ac90d338
commit 7ed7f0f26a
9 changed files with 341 additions and 4 deletions

View File

@@ -3,6 +3,9 @@ from .. import pre_tokenizers
# Re-export the pre-tokenizer classes from the `pre_tokenizers` module
# (imported above via `from .. import pre_tokenizers`) so callers can use
# e.g. `tokenizers.pre_tokenizers.Whitespace` directly.
# NOTE(review): `pre_tokenizers` is presumably the compiled extension
# module backing these classes — confirm against the package layout.
PreTokenizer = pre_tokenizers.PreTokenizer
ByteLevel = pre_tokenizers.ByteLevel
Whitespace = pre_tokenizers.Whitespace
# Added in this commit: see the classes of the same names for their contracts.
Deduplication = pre_tokenizers.Deduplication
Punctuation = pre_tokenizers.Punctuation
Sequence = pre_tokenizers.Sequence
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace

View File

@@ -107,3 +107,34 @@ class CharDelimiterSplit(PreTokenizer):
The delimiter char that will be used to split input
"""
pass
class Deduplication(PreTokenizer):
    """Deduplication PreTokenizer

    This pre-tokenizer removes duplicate spaces within strings, per the
    commit that introduced it.
    (The previous docstring — "splits using the regex `\\w+|[^\\w\\s]+`" —
    was copy-pasted from another pre-tokenizer and did not describe this
    class.)
    """

    def __init__(self) -> None:
        """Instantiate a new Deduplication PreTokenizer (no parameters)."""
        pass
class Punctuation(PreTokenizer):
    """Punctuation PreTokenizer

    This pre-tokenizer splits punctuation characters off as isolated
    tokens, per the commit that introduced it.
    (The previous docstring — "splits using the regex `\\w+|[^\\w\\s]+`" —
    was copy-pasted from another pre-tokenizer and did not describe this
    class.)
    """

    def __init__(self) -> None:
        """Instantiate a new Punctuation PreTokenizer (no parameters)."""
        pass
class Sequence(PreTokenizer):
    """Sequence PreTokenizer

    Applies a list of pre-tokenizers iteratively, per the commit that
    introduced it.
    (The previous docstring — "splits using the regex `\\w+|[^\\w\\s]+`" —
    was copy-pasted from another pre-tokenizer and did not describe this
    class.)

    NOTE(review): `__init__` is declared with no parameters, yet a Sequence
    presumably needs the list of pre-tokenizers to apply — confirm this stub
    signature against the actual implementation.
    """

    def __init__(self) -> None:
        """Instantiate a new Sequence PreTokenizer."""
        pass