Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 16:49:27 +00:00)
Upgrading to black 20.8b1
Committed by: Anthony MOI
Parent: dc1d0711cf
Commit: a410903051
.github/workflows/python.yml (vendored): 2 changed lines
@@ -25,7 +25,7 @@ jobs:
           architecture: "x64"

       - name: Install dependencies
-        run: pip install black==19.10b0
+        run: pip install black==20.8b1

       - name: Check style
         working-directory: ./bindings/python
@@ -70,13 +70,17 @@ elif args.type == "bert":

     tok_r = Tokenizer(WordPiece(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100))
     tok_r.normalizer = BertNormalizer(
-        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
+        clean_text=True,
+        handle_chinese_chars=True,
+        strip_accents=True,
+        lowercase=True,
     )
     # tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
     tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
     tok_r.decoder = decoders.WordPiece()
     tok_r.post_processor = BertProcessing(
-        ("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")),
+        ("[SEP]", tok_r.token_to_id("[SEP]")),
+        ("[CLS]", tok_r.token_to_id("[CLS]")),
     )
 else:
     raise Exception(f"Unknown type {args.type}")
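Note (not part of the commit): the hunk above is a pure formatting change, so the assembled tokenizer behaves exactly as before. A minimal sketch of exercising it, assuming the example script was run with a BERT vocab so that `tok_r` exists:

    output = tok_r.encode("Hello, world!")
    print(output.tokens)             # expected to start with "[CLS]" and end with "[SEP]" via BertProcessing
    print(tok_r.decode(output.ids))  # the WordPiece decoder re-joins "##" continuation pieces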
@@ -32,7 +32,10 @@ if not files:

 # Initialize an empty tokenizer
 tokenizer = BertWordPieceTokenizer(
-    clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
+    clean_text=True,
+    handle_chinese_chars=True,
+    strip_accents=True,
+    lowercase=True,
 )

 # And then train
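Note (not part of the commit): the training call itself is outside this hunk. A hedged sketch of how such a tokenizer is typically trained and used; the file paths and hyperparameters are illustrative, not taken from the repository's script:

    from tokenizers import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )
    tokenizer.train(files=["data/file1.txt", "data/file2.txt"], vocab_size=30000, min_frequency=2)
    print(tokenizer.encode("Hello, y'all!").tokens)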
@@ -9,7 +9,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]

 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
@@ -21,7 +21,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]

 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
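Note (not part of the commit): these aliases describe the input shapes accepted by `Tokenizer.encode` and `encode_batch`. A hedged sketch, assuming a trained `tokenizer` (for instance the BertWordPieceTokenizer above):

    # TextInputSequence: a raw string
    tokenizer.encode("Hello world")
    # PreTokenizedInputSequence: an already-split sequence, flagged as such
    tokenizer.encode(["Hello", "world"], is_pretokenized=True)
    # TextEncodeInput: single sequences and (sequence, pair) tuples, e.g. in a batch
    tokenizer.encode_batch(["Hello world", ("How are you?", "Fine, thanks.")])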
@@ -52,7 +53,7 @@ Range = Union[int, Tuple[int, int], slice]
 Pattern = Union[str, Regex]

 class PreTokenizedString:
-    """ PreTokenizedString
+    """PreTokenizedString

     Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
     underlying string, while keeping track of the alignment information (offsets).
@@ -66,7 +67,7 @@ class PreTokenizedString:
     """

     def __new__(sequence: str) -> PreTokenizedString:
-        """ Instantiate a new PreTokenizedString using the given str
+        """Instantiate a new PreTokenizedString using the given str

         Args:
             sequence: str:
@@ -74,7 +75,7 @@ class PreTokenizedString:
         """
         pass
     def split(self, func: Callable[[index, NormalizedString], List[NormalizedString]]):
-        """ Split the PreTokenizedString using the given `func`
+        """Split the PreTokenizedString using the given `func`

         Args:
             func: Callable[[index, NormalizedString], List[NormalizedString]]:
@@ -87,7 +88,7 @@ class PreTokenizedString:
         """
         pass
     def normalize(self, func: Callable[[NormalizedString], None]):
-        """ Normalize each split of the `PreTokenizedString` using the given `func`
+        """Normalize each split of the `PreTokenizedString` using the given `func`

         Args:
             func: Callable[[NormalizedString], None]:
@@ -97,7 +98,7 @@ class PreTokenizedString:
         """
         pass
     def tokenize(self, func: Callable[[str], List[Token]]):
-        """ Tokenize each split of the `PreTokenizedString` using the given `func`
+        """Tokenize each split of the `PreTokenizedString` using the given `func`

         Args:
             func: Callable[[str], List[Token]]:
@@ -106,7 +107,7 @@ class PreTokenizedString:
         """
         pass
     def to_encoding(self, type_id: int = 0, word_idx: Optional[int] = None) -> Encoding:
-        """ Return an Encoding generated from this PreTokenizedString
+        """Return an Encoding generated from this PreTokenizedString

         Args:
             type_id: int = 0:
@@ -126,7 +127,7 @@ class PreTokenizedString:
         offset_referential: OffsetReferential = OffsetReferential.ORIGINAL,
         offset_type: OffsetType = OffsetType.CHAR,
     ) -> List[Split]:
-        """ Get the splits currently managed by the PreTokenizedString
+        """Get the splits currently managed by the PreTokenizedString

         Args:
             offset_referential: OffsetReferential:
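Note (not part of the commit): `PreTokenizedString` and the `split`/`normalize`/`tokenize` hooks above are what user-defined components receive. A hedged sketch of a custom pre-tokenizer built on `split`; the class name is made up, and slicing a `NormalizedString` into sub-splits follows the pattern used for custom components in the library's documentation:

    from tokenizers import NormalizedString, PreTokenizedString

    class TwoCharPreTokenizer:
        # Illustrative only: split each sequence into fixed 2-character chunks.
        def _chunks(self, i: int, normalized: NormalizedString):
            length = len(str(normalized))
            return [normalized[j : j + 2] for j in range(0, length, 2)]

        def pre_tokenize(self, pretok: PreTokenizedString):
            pretok.split(self._chunks)

    pretok = PreTokenizedString("abcdef")
    TwoCharPreTokenizer().pre_tokenize(pretok)
    print(pretok.get_splits())  # each split keeps its offsets into the original string
    # To plug into a pipeline (assumed): tok.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(TwoCharPreTokenizer())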
@@ -145,7 +146,7 @@ class PreTokenizedString:
         pass

 class NormalizedString:
-    """ NormalizedString
+    """NormalizedString

     A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
     While making all the requested modifications, it keeps track of the alignment information
@@ -153,7 +154,7 @@ class NormalizedString:
     """

     def __new__(sequence: str) -> NormalizedString:
-        """ Instantiate a new NormalizedString using the given str
+        """Instantiate a new NormalizedString using the given str

         Args:
             sequence: str:
@@ -214,14 +215,14 @@ class NormalizedString:
         """ Calls the given function for each character of the string """
         pass
     def map(self, func: Callable[[str], str]):
-        """ Calls the given function for each character of the string
+        """Calls the given function for each character of the string

         Replaces each character of the string using the returned value. Each
         returned value **must** be a str of length 1 (ie a character).
         """
         pass
     def split(self, pattern: Pattern, behavior: SplitDelimiterBehavior) -> List[NormalizedString]:
-        """ Split the NormalizedString using the given pattern and the specified behavior
+        """Split the NormalizedString using the given pattern and the specified behavior

         Args:
             pattern: Pattern:
@@ -235,7 +236,7 @@ class NormalizedString:
         """
         pass
     def replace(self, pattern: Pattern, content: str):
-        """ Replace the content of the given pattern with the provided content
+        """Replace the content of the given pattern with the provided content

         Args:
             pattern: Pattern:
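Note (not part of the commit): `replace`, `map` and `split` are the in-place mutation hooks that a user-defined normalizer works with. A hedged sketch; the class name is made up, and attaching it via `Normalizer.custom` is an assumption about the bindings' custom-component support:

    from tokenizers import NormalizedString, Regex

    class LowercaseWhitespaceNormalizer:
        # Illustrative only: collapse runs of whitespace, then lowercase in place.
        def normalize(self, normalized: NormalizedString):
            normalized.replace(Regex(r"\s+"), " ")  # NormalizedString.replace(pattern, content)
            normalized.lowercase()

    ns = NormalizedString("Hello\t  WORLD")
    LowercaseWhitespaceNormalizer().normalize(ns)
    print(str(ns))  # "hello world", with alignment to the original string still tracked
    # To plug into a pipeline (assumed): tok.normalizer = normalizers.Normalizer.custom(LowercaseWhitespaceNormalizer())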
@@ -827,7 +828,10 @@ class Tokenizer:
         """
         pass
     def post_process(
-        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
+        self,
+        encoding: Encoding,
+        pair: Optional[Encoding] = None,
+        add_special_tokens: bool = True,
     ) -> Encoding:
         """Apply all the post-processing steps to the given encodings.

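Note (not part of the commit): a hedged sketch of calling this method, assuming a tokenizer with a post-processor attached (for instance `tok_r` from the BERT example script above):

    enc = tok_r.encode("How are you?", add_special_tokens=False)
    pair = tok_r.encode("Fine, thanks.", add_special_tokens=False)
    merged = tok_r.post_process(
        enc,
        pair=pair,
        add_special_tokens=True,
    )
    print(merged.tokens)  # with BertProcessing attached: [CLS] ... [SEP] ... [SEP]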
@@ -21,7 +21,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
     """

     def __init__(
-        self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
+        self,
+        vocab: Optional[str] = None,
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
     ):
         if vocab is not None:
             # Let Unigram(..) fail if only one of them is None
@@ -29,7 +32,12 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(Unigram())

-        tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC(),])
+        tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Nmt(),
+                normalizers.NFKC(),
+            ]
+        )
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [
                 pre_tokenizers.WhitespaceSplit(),
@@ -60,7 +68,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         """ Train the model using the given files """

         trainer = trainers.UnigramTrainer(
-            vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress,
+            vocab_size=vocab_size,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
         )

         if isinstance(files, str):
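Note (not part of the commit): a hedged sketch of using this implementation end to end; the corpus path and vocabulary size are illustrative:

    from tokenizers.implementations import SentencePieceUnigramTokenizer

    tokenizer = SentencePieceUnigramTokenizer()  # replacement="▁", add_prefix_space=True by default
    tokenizer.train(
        files=["corpus.txt"],  # hypothetical training file
        vocab_size=8000,
        show_progress=True,
    )
    print(tokenizer.encode("Hello world").tokens)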
@@ -19,7 +19,10 @@ class TestBPE:
         BPE(vocab=vocab)
         BPE(merges=merges)

-        assert isinstance(pickle.loads(pickle.dumps(BPE(vocab, merges))), BPE,)
+        assert isinstance(
+            pickle.loads(pickle.dumps(BPE(vocab, merges))),
+            BPE,
+        )

         # Deprecated calls in 0.9
         with pytest.deprecated_call():
@@ -22,7 +22,8 @@ class TestBertProcessing:
         assert isinstance(processor, PostProcessor)
         assert isinstance(processor, BertProcessing)
         assert isinstance(
-            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing,
+            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))),
+            BertProcessing,
         )

     def test_processing(self):
@@ -94,7 +95,9 @@ class TestTemplateProcessing:

     def get_roberta(self):
         return TemplateProcessing(
-            seq_a="<s> $0 </s>", seq_b="</s> $0 </s>", special_tokens=[("<s>", 0), ("</s>", 1)],
+            seq_a="<s> $0 </s>",
+            seq_b="</s> $0 </s>",
+            special_tokens=[("<s>", 0), ("</s>", 1)],
         )

     def get_t5_squad(self):
@@ -232,10 +232,12 @@ class TestTokenizer:

         # Numpy
         test_single(
-            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]), True,
+            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]),
+            True,
         )
         test_single(
-            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))), True,
+            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))),
+            True,
         )
         test_pair(
             np.array(
@@ -276,7 +278,8 @@

         tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
         tokenizer.post_processor = RobertaProcessing(
-            ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
+            ("</s>", tokenizer.token_to_id("</s>")),
+            ("<s>", tokenizer.token_to_id("<s>")),
         )

         # Can encode with special tokens
@@ -65,7 +65,10 @@ class TestByteLevelBPE:

     def test_lowerspace(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(
-            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
+            roberta_files["vocab"],
+            roberta_files["merges"],
+            add_prefix_space=True,
+            lowercase=True,
         )
         output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")
