mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-08 05:38:23 +00:00
Split Pre-Tokenizer (#542)
* start playing around * make a first version * refactor * apply make format * add python bindings * add some python binding tests * correct pre-tokenizers * update auto-generated bindings * lint python bindings * add code node * add split to docs * refactor python binding a bit * cargo fmt * clippy and fmt in node * quick updates and fixes * Oops * Update node typings * Update changelog Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
committed by
GitHub
parent
58e1d8de67
commit
dd399d2ad0
@@ -9,6 +9,7 @@ Digits = pre_tokenizers.Digits
|
||||
# Module-level re-exports: flatten the `pre_tokenizers` namespace so each
# pre-tokenizer class is importable directly from this module.
# NOTE(review): per the commit message these bindings are auto-generated —
# presumably mirroring the native (Rust) extension; confirm before hand-editing.
Metaspace = pre_tokenizers.Metaspace
Punctuation = pre_tokenizers.Punctuation
Sequence = pre_tokenizers.Sequence
Split = pre_tokenizers.Split
UnicodeScripts = pre_tokenizers.UnicodeScripts
Whitespace = pre_tokenizers.Whitespace
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
@@ -392,6 +392,40 @@ class Sequence(PreTokenizer):
|
||||
"""
|
||||
pass
|
||||
|
||||
class Split(PreTokenizer):
    """
    Split PreTokenizer

    This versatile pre-tokenizer splits using the provided pattern and
    according to the provided behavior. The pattern can be inverted by
    making use of the invert flag.

    Args:
        pattern (:obj:`str` or :class:`~tokenizers.Regex`):
            A pattern used to split the string. Usually a string or a Regex

        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
            The behavior to use when splitting.
            Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
            "contiguous"

        invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to invert the pattern.
    """

    # NOTE(review): auto-generated binding stub (the commit message says
    # "update auto-generated bindings") — the bodies are placeholders and the
    # real implementation presumably lives in the native extension. Do not
    # hand-edit the signatures; regenerate instead.
    def __init__(self, pattern, behavior, invert=False):
        pass

    def pre_tokenize(self, pretok):
        """
        Pre tokenize the given PreTokenizedString in-place
        """
        pass

    def pre_tokenize_str(self, sequence):
        """
        Pre tokenize the given sequence
        """
        pass
|
||||
class UnicodeScripts(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer splits on characters that belong to different language family
|
||||
|
||||
Reference in New Issue
Block a user