We use black 19.10b0, not 20, here...

Nicolas Patry
2020-09-23 11:58:15 +02:00
parent 4705fa8a00
commit 9672995a56
9 changed files with 16 additions and 50 deletions
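
For context: the incompatibility between the two black releases is the "magic trailing comma". black 19.10b0 joins arguments onto a single indented line (keeping the trailing comma) whenever they fit within the line length, while black 20.8b0 and later treat a pre-existing trailing comma as a request to keep one element per line, so running a newer black on this tree re-explodes every signature this commit collapses. A minimal sketch of the two outputs, using the post_process signature from the diff below (the Encoding stand-in and the class names are illustrative only, not project code):

    from typing import Optional

    class Encoding: ...  # stand-in for tokenizers.Encoding, for illustration only

    class FormattedBy19:
        # black 19.10b0: the full def does not fit on one line, so the
        # arguments are joined onto one indented line and the trailing
        # comma is kept.
        def post_process(
            self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
        ) -> Encoding: ...

    class FormattedBy20:
        # black >= 20.8b0: the pre-existing trailing comma is "magic" and
        # forces one argument per line, undoing this commit's formatting.
        def post_process(
            self,
            encoding: Encoding,
            pair: Optional[Encoding] = None,
            add_special_tokens: bool = True,
        ) -> Encoding: ...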

View File

@@ -9,8 +9,7 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence,
-    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
 

View File

@@ -21,8 +21,7 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence,
-    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
 
@@ -828,10 +827,7 @@ class Tokenizer:
         """
         pass
     def post_process(
-        self,
-        encoding: Encoding,
-        pair: Optional[Encoding] = None,
-        add_special_tokens: bool = True,
+        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
     ) -> Encoding:
         """Apply all the post-processing steps to the given encodings.
 

View File

@@ -21,10 +21,7 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
     """
 
     def __init__(
-        self,
-        vocab: Optional[str] = None,
-        replacement: str = "▁",
-        add_prefix_space: bool = True,
+        self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
     ):
         if vocab is not None:
             # Let Unigram(..) fail if only one of them is None
@@ -32,12 +29,7 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(Unigram())
 
-        tokenizer.normalizer = normalizers.Sequence(
-            [
-                normalizers.Nmt(),
-                normalizers.NFKC(),
-            ]
-        )
+        tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC(),])
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [
                 pre_tokenizers.WhitespaceSplit(),
@@ -68,9 +60,7 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         """ Train the model using the given files """
         trainer = trainers.UnigramTrainer(
-            vocab_size=vocab_size,
-            special_tokens=special_tokens,
-            show_progress=show_progress,
+            vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress,
         )
         if isinstance(files, str):
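
One way to keep the style from drifting again is to pin the formatter and run it in check mode. A hypothetical sketch (the path and any CI wiring are assumptions, not the repository's actual tooling):

    import subprocess

    # Assumes black==19.10b0 is installed (pip install black==19.10b0).
    # --check makes black exit non-zero if any file would be reformatted,
    # so check=True raises CalledProcessError when the tree drifts from
    # the pinned style. "py_src" is a placeholder path.
    subprocess.run(["black", "--check", "py_src"], check=True)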