Adding a new pre_tokenizer: Digits.
Easier to split on digits:

Digits(individual_digits=False) -> 'Call 123 please' becomes 'Call ', '123', 'please'
Digits(individual_digits=True) -> 'Call 123 please' becomes 'Call ', '1', '2', '3', 'please'
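For illustration, a minimal sketch of the two modes from Python. It assumes a tokenizers build that exposes pre_tokenizers.Digits together with the PreTokenizer.pre_tokenize_str helper (present in later releases; the stub touched by this commit only declares pre_tokenize), so treat the call names as assumptions rather than part of this change:

from tokenizers import pre_tokenizers

# individual_digits=False keeps consecutive digits together as one piece
grouped = pre_tokenizers.Digits(individual_digits=False)
print(grouped.pre_tokenize_str("Call 123 please"))
# expected pieces (each paired with its offsets): "Call ", "123", " please"

# individual_digits=True isolates every digit on its own
per_digit = pre_tokenizers.Digits(individual_digits=True)
print(per_digit.pre_tokenize_str("Call 123 please"))
# expected pieces: "Call ", "1", "2", "3", " please"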
@@ -9,3 +9,4 @@ WhitespaceSplit = pre_tokenizers.WhitespaceSplit
 BertPreTokenizer = pre_tokenizers.BertPreTokenizer
 Metaspace = pre_tokenizers.Metaspace
 CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
+Digits = pre_tokenizers.Digits
@@ -3,7 +3,7 @@ from typing import Optional, List, Tuple
 Offsets = Tuple[int, int]

 class PreTokenizer:
-    """ Base class for all pre-tokenizers
+    """Base class for all pre-tokenizers

     This class is not supposed to be instantiated directly. Instead, any implementation of a
     PreTokenizer will return an instance of this class when instantiated.
@@ -14,14 +14,14 @@ class PreTokenizer:
         pass

 class ByteLevel(PreTokenizer):
-    """ ByteLevel PreTokenizer
+    """ByteLevel PreTokenizer

     This pre-tokenizer takes care of replacing all bytes of the given string
     with a corresponding representation, as well as splitting into words.
     """

     def __init__(self, add_prefix_space: bool = True) -> None:
-        """ Instantiate a new ByteLevel PreTokenizer
+        """Instantiate a new ByteLevel PreTokenizer
         Args:
             add_prefix_space: (`optional`) boolean:
                 Whether to add a space to the first word if there isn't already one. This
@@ -32,7 +32,7 @@ class ByteLevel(PreTokenizer):
         pass
     @staticmethod
     def alphabet() -> List[str]:
-        """ Returns the alphabet used by this PreTokenizer.
+        """Returns the alphabet used by this PreTokenizer.

         Since the ByteLevel works as its name suggests, at the byte level, it
         encodes any byte to one visible character. This means that there is a
@@ -41,7 +41,7 @@ class ByteLevel(PreTokenizer):
         pass

 class Whitespace(PreTokenizer):
-    """ Whitespace PreTokenizer
+    """Whitespace PreTokenizer

     This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
     """
@@ -51,7 +51,7 @@ class Whitespace(PreTokenizer):
         pass

 class WhitespaceSplit(PreTokenizer):
-    """ Whitespace PreTokenizer
+    """Whitespace PreTokenizer

     This pre-tokenizer simply splits on the whitespace. Works like `.split()`
     """
@@ -61,7 +61,7 @@ class WhitespaceSplit(PreTokenizer):
         pass

 class BertPreTokenizer(PreTokenizer):
-    """ BertPreTokenizer
+    """BertPreTokenizer

     This pre-tokenizer splits tokens on spaces, and also on punctuation.
     Each occurence of a punctuation character will be treated separately.
@@ -72,14 +72,14 @@ class BertPreTokenizer(PreTokenizer):
         pass

 class Metaspace(PreTokenizer):
-    """ Metaspace pre-tokenizer
+    """Metaspace pre-tokenizer

     This pre-tokenizer replaces any whitespace by the provided replacement character.
     It then tries to split on these spaces.
     """

     def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
-        """ Instantiate a new Metaspace
+        """Instantiate a new Metaspace

         Args:
             replacement: str:
@@ -93,14 +93,14 @@ class Metaspace(PreTokenizer):
         pass

 class CharDelimiterSplit(PreTokenizer):
-    """ CharDelimiterSplit PreTokenizer
+    """CharDelimiterSplit PreTokenizer

     This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
     """

     @staticmethod
     def __init__(self, delimiter: str) -> None:
-        """ Instantiate a new CharDelimiterSplit PreTokenizer
+        """Instantiate a new CharDelimiterSplit PreTokenizer

         Args:
             delimiter: str:
@@ -109,7 +109,7 @@ class CharDelimiterSplit(PreTokenizer):
         pass

 class Punctuation(PreTokenizer):
-    """ Punctuation PreTokenizer
+    """Punctuation PreTokenizer

     This pre-tokenizer simply splits on punctuation as individual characters.`
     """
@@ -119,7 +119,7 @@ class Punctuation(PreTokenizer):
         pass

 class Sequence(PreTokenizer):
-    """ Sequence PreTokenizer
+    """Sequence PreTokenizer

     This pre-tokenizer composes other pre_tokenizers and applies them in sequence`
     """
@@ -127,3 +127,20 @@ class Sequence(PreTokenizer):
     def __init__(self) -> None:
         """ Instantiate a new Sequence PreTokenizer """
         pass
+
+class Digits(PreTokenizer):
+    """Digits PreTokenizer
+
+    This pre-tokenizer simply splits using the digits in separate tokens
+    """
+
+    def __init__(self, individual_digits: bool) -> None:
+        """Instantiate a new Digits
+
+        Args:
+            individual_digits: bool:
+                If set to True, digits will each be separated "Call 123 please" -> "Call ", "1", "2", "3", " please"
+                If set to False, digits will grouped "Call 123 please" -> "Call ", "123", " please"
+
+        """
+        pass
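As a usage note rather than part of the diff, a short sketch of wiring the new pre-tokenizer into a full Tokenizer, assuming the standard tokenizers Python binding where pre_tokenizer is an assignable attribute and models.BPE() can be constructed with an empty vocabulary:

from tokenizers import Tokenizer, pre_tokenizers
from tokenizers.models import BPE

# An empty BPE model is enough for a demo; any model would do here
tokenizer = Tokenizer(BPE())

# Every input now passes through Digits before the model sees it,
# so numbers are always split away from the surrounding text
tokenizer.pre_tokenizer = pre_tokenizers.Digits(individual_digits=True)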