Adding a new pre_tokenizer: Digits.

Makes it easier to split on digits:

Digits(individual_digits=False) -> 'Call 123 please' becomes 'Call ', '123', ' please'
Digits(individual_digits=True) -> 'Call 123 please' becomes 'Call ', '1', '2', '3', ' please'
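
As a quick sketch (not part of the commit itself), the new pre-tokenizer can be exercised from the Python bindings; this assumes the pre_tokenize_str inspection helper available on pre-tokenizers in recent tokenizers releases, and the offsets in the comments are illustrative:

from tokenizers import pre_tokenizers

# Keep digit runs together (individual_digits=False).
grouped = pre_tokenizers.Digits(individual_digits=False)
print(grouped.pre_tokenize_str("Call 123 please"))
# e.g. [('Call ', (0, 5)), ('123', (5, 8)), (' please', (8, 15))]

# Give every digit its own token (individual_digits=True).
single = pre_tokenizers.Digits(individual_digits=True)
print(single.pre_tokenize_str("Call 123 please"))
# e.g. [('Call ', (0, 5)), ('1', (5, 6)), ('2', (6, 7)), ('3', (7, 8)), (' please', (8, 15))]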
Nicolas Patry
2020-09-03 17:27:58 +02:00
parent b8f1eb48cb
commit 7b2caca764
10 changed files with 293 additions and 18 deletions

@@ -9,3 +9,4 @@ WhitespaceSplit = pre_tokenizers.WhitespaceSplit
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
+Digits = pre_tokenizers.Digits
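
With this export in place, Digits becomes importable next to the existing pre-tokenizers; a minimal sketch, assuming the usual package layout of the Python bindings:

from tokenizers.pre_tokenizers import Digits

pre_tok = Digits(individual_digits=False)  # same object as tokenizers.pre_tokenizers.Digits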

@@ -3,7 +3,7 @@ from typing import Optional, List, Tuple
Offsets = Tuple[int, int]
class PreTokenizer:
""" Base class for all pre-tokenizers
"""Base class for all pre-tokenizers
This class is not supposed to be instantiated directly. Instead, any implementation of a
PreTokenizer will return an instance of this class when instantiated.
@@ -14,14 +14,14 @@ class PreTokenizer:
pass
class ByteLevel(PreTokenizer):
""" ByteLevel PreTokenizer
"""ByteLevel PreTokenizer
This pre-tokenizer takes care of replacing all bytes of the given string
with a corresponding representation, as well as splitting into words.
"""
def __init__(self, add_prefix_space: bool = True) -> None:
""" Instantiate a new ByteLevel PreTokenizer
"""Instantiate a new ByteLevel PreTokenizer
Args:
add_prefix_space: (`optional`) boolean:
Whether to add a space to the first word if there isn't already one. This
@@ -32,7 +32,7 @@ class ByteLevel(PreTokenizer):
pass
@staticmethod
def alphabet() -> List[str]:
""" Returns the alphabet used by this PreTokenizer.
"""Returns the alphabet used by this PreTokenizer.
Since the ByteLevel works as its name suggests, at the byte level, it
encodes any byte to one visible character. This means that there is a
@@ -41,7 +41,7 @@ class ByteLevel(PreTokenizer):
pass
class Whitespace(PreTokenizer):
""" Whitespace PreTokenizer
"""Whitespace PreTokenizer
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
"""
@@ -51,7 +51,7 @@ class Whitespace(PreTokenizer):
pass
class WhitespaceSplit(PreTokenizer):
""" Whitespace PreTokenizer
"""Whitespace PreTokenizer
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
"""
@@ -61,7 +61,7 @@ class WhitespaceSplit(PreTokenizer):
pass
class BertPreTokenizer(PreTokenizer):
""" BertPreTokenizer
"""BertPreTokenizer
This pre-tokenizer splits tokens on spaces, and also on punctuation.
Each occurence of a punctuation character will be treated separately.
@@ -72,14 +72,14 @@ class BertPreTokenizer(PreTokenizer):
pass
class Metaspace(PreTokenizer):
""" Metaspace pre-tokenizer
"""Metaspace pre-tokenizer
This pre-tokenizer replaces any whitespace by the provided replacement character.
It then tries to split on these spaces.
"""
def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
""" Instantiate a new Metaspace
"""Instantiate a new Metaspace
Args:
replacement: str:
@@ -93,14 +93,14 @@ class Metaspace(PreTokenizer):
pass
class CharDelimiterSplit(PreTokenizer):
""" CharDelimiterSplit PreTokenizer
"""CharDelimiterSplit PreTokenizer
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
"""
@staticmethod
def __init__(self, delimiter: str) -> None:
""" Instantiate a new CharDelimiterSplit PreTokenizer
"""Instantiate a new CharDelimiterSplit PreTokenizer
Args:
delimiter: str:
@@ -109,7 +109,7 @@ class CharDelimiterSplit(PreTokenizer):
pass
class Punctuation(PreTokenizer):
""" Punctuation PreTokenizer
"""Punctuation PreTokenizer
This pre-tokenizer simply splits on punctuation as individual characters.`
"""
@@ -119,7 +119,7 @@ class Punctuation(PreTokenizer):
pass
class Sequence(PreTokenizer):
""" Sequence PreTokenizer
"""Sequence PreTokenizer
This pre-tokenizer composes other pre_tokenizers and applies them in sequence`
"""
@@ -127,3 +127,20 @@ class Sequence(PreTokenizer):
def __init__(self) -> None:
""" Instantiate a new Sequence PreTokenizer """
pass
+class Digits(PreTokenizer):
+    """Digits PreTokenizer
+
+    This pre-tokenizer simply splits using the digits in separate tokens
+    """
+
+    def __init__(self, individual_digits: bool) -> None:
+        """Instantiate a new Digits
+
+        Args:
+            individual_digits: bool:
+                If set to True, digits will each be separated "Call 123 please" -> "Call ", "1", "2", "3", " please"
+                If set to False, digits will grouped "Call 123 please" -> "Call ", "123", " please"
+        """
+        pass
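
To show where the new class sits in the stubbed API above, here is a minimal sketch combining it with the existing Whitespace and Sequence pre-tokenizers on an untrained BPE tokenizer; the BPE model, the unk_token choice, and the pre_tokenize_str call are assumptions borrowed from the wider library, not part of this commit:

from tokenizers import Tokenizer, pre_tokenizers
from tokenizers.models import BPE

# Whitespace splitting first, then digit runs broken into single digits.
pre_tok = pre_tokenizers.Sequence([
    pre_tokenizers.Whitespace(),
    pre_tokenizers.Digits(individual_digits=True),
])

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))  # hypothetical model choice
tokenizer.pre_tokenizer = pre_tok

print(pre_tok.pre_tokenize_str("Call 123 please"))
# Roughly: [('Call', (0, 4)), ('1', (5, 6)), ('2', (6, 7)), ('3', (7, 8)), ('please', (9, 15))]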