mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-05 12:18:20 +00:00
Adding 3 new PreTokenizers:
- Deduplication: Removes duplicate spaces within strings
- Punctuation: Splits punctuation characters as isolated tokens
- Sequence: Applies a list of pre-tokenizers iteratively
This commit is contained in:
committed by
Anthony MOI
parent
50ac90d338
commit
7ed7f0f26a
@@ -3,6 +3,9 @@ from .. import pre_tokenizers
|
||||
PreTokenizer = pre_tokenizers.PreTokenizer
|
||||
ByteLevel = pre_tokenizers.ByteLevel
|
||||
Whitespace = pre_tokenizers.Whitespace
|
||||
Deduplication = pre_tokenizers.Deduplication
|
||||
Punctuation = pre_tokenizers.Punctuation
|
||||
Sequence = pre_tokenizers.Sequence
|
||||
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
|
||||
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
|
||||
Metaspace = pre_tokenizers.Metaspace
|
||||
|
||||
@@ -107,3 +107,34 @@ class CharDelimiterSplit(PreTokenizer):
|
||||
The delimiter char that will be used to split input
|
||||
"""
|
||||
pass
|
||||
|
||||
class Deduplication(PreTokenizer):
    """Deduplication PreTokenizer

    This pre-tokenizer removes duplicate (consecutive) spaces within input
    strings before tokenization.
    """

    def __init__(self) -> None:
        """Instantiate a new Deduplication PreTokenizer."""
        # Stub only: the real implementation lives in the native
        # `pre_tokenizers` extension module.
        pass
||||
class Punctuation(PreTokenizer):
|
||||
""" Punctuation PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new Punctuation PreTokenizer """
|
||||
pass
|
||||
|
||||
|
||||
class Sequence(PreTokenizer):
|
||||
""" Sequence PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new Sequence PreTokenizer """
|
||||
pass
|
||||
|
||||
Reference in New Issue
Block a user