mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-05 04:08:22 +00:00
Addressing comments:
- Remove Deduplication in favor of WhitespaceSplit. - Updated comments
This commit is contained in:
committed by
Anthony MOI
parent
1f65b4393c
commit
857948e5b8
@@ -108,20 +108,10 @@ class CharDelimiterSplit(PreTokenizer):
|
||||
"""
|
||||
pass
|
||||
|
||||
class Deduplication(PreTokenizer):
|
||||
""" Deduplication PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new Deduplication PreTokenizer """
|
||||
pass
|
||||
|
||||
class Punctuation(PreTokenizer):
|
||||
""" Punctuation PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||
This pre-tokenizer simply splits on punctuation as individual characters.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
@@ -131,7 +121,7 @@ class Punctuation(PreTokenizer):
|
||||
class Sequence(PreTokenizer):
|
||||
""" Sequence PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||
This pre-tokenizer composes other pre_tokenizers and applies them in sequence.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
|
||||
Reference in New Issue
Block a user