diff --git a/bindings/python/src/encoding.rs b/bindings/python/src/encoding.rs
index a93d29e7..ac0be33d 100644
--- a/bindings/python/src/encoding.rs
+++ b/bindings/python/src/encoding.rs
@@ -109,7 +109,7 @@ impl Encoding {
 impl PyObjectProtocol for Encoding {
     fn __repr__(&self) -> PyResult<String> {
         Ok(format!(
-            "Encoding(num_tokens={}, attributs=[ids, type_ids, tokens, offsets, \
+            "Encoding(num_tokens={}, attributes=[ids, type_ids, tokens, offsets, \
             attention_mask, special_tokens_mask, overflowing, original_str, normalized_str])",
             self.encoding.get_ids().len()
         ))
diff --git a/bindings/python/tokenizers/implementations/base_tokenizer.py b/bindings/python/tokenizers/implementations/base_tokenizer.py
index e8ac08c1..2c3d9432 100644
--- a/bindings/python/tokenizers/implementations/base_tokenizer.py
+++ b/bindings/python/tokenizers/implementations/base_tokenizer.py
@@ -12,7 +12,7 @@ class BaseTokenizer:
     def __repr__(self):
         return "Tokenizer(vocabulary_size={}, {})".format(
             self._tokenizer.get_vocab_size(),
-            ', '.join(k + ': ' + str(v) for k, v in self._parameters.items()))
+            ', '.join(k + '=' + str(v) for k, v in self._parameters.items()))

     def enable_padding(self,
                        direction: Optional[str] = "right",
diff --git a/bindings/python/tokenizers/implementations/bert_wordpiece.py b/bindings/python/tokenizers/implementations/bert_wordpiece.py
index c3564c21..a7d89f04 100644
--- a/bindings/python/tokenizers/implementations/bert_wordpiece.py
+++ b/bindings/python/tokenizers/implementations/bert_wordpiece.py
@@ -33,14 +33,14 @@ class BertWordPieceTokenizer(BaseTokenizer):
                                                   lowercase=lowercase)
         tokenizer.pre_tokenizer = BertPreTokenizer.new()

-        sep_token_id = tokenizer.token_to_id(sep_token)
-        if sep_token_id is None:
-            raise TypeError("sep_token not found in the vocabulary")
-        cls_token_id = tokenizer.token_to_id(cls_token)
-        if cls_token_id is None:
-            raise TypeError("cls_token not found in the vocabulary")
+        if add_special_tokens and vocab_file is not None:
+            sep_token_id = tokenizer.token_to_id(sep_token)
+            if sep_token_id is None:
+                raise TypeError("sep_token not found in the vocabulary")
+            cls_token_id = tokenizer.token_to_id(cls_token)
+            if cls_token_id is None:
+                raise TypeError("cls_token not found in the vocabulary")

-        if add_special_tokens:
             tokenizer.post_processor = BertProcessing.new(
                 (sep_token, sep_token_id),
                 (cls_token, cls_token_id)
@@ -81,4 +81,6 @@ class BertWordPieceTokenizer(BaseTokenizer):
             show_progress=show_progress,
             continuing_subword_prefix=wordpieces_prefix
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)
diff --git a/bindings/python/tokenizers/implementations/bpe.py b/bindings/python/tokenizers/implementations/bpe.py
index 70c86860..eae03502 100644
--- a/bindings/python/tokenizers/implementations/bpe.py
+++ b/bindings/python/tokenizers/implementations/bpe.py
@@ -61,4 +61,6 @@ class BPETokenizer(BaseTokenizer):
             end_of_word_suffix=suffix,
             show_progress=show_progress
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)
diff --git a/bindings/python/tokenizers/implementations/byte_level_bpe.py b/bindings/python/tokenizers/implementations/byte_level_bpe.py
index 118e516c..141cb9cd 100644
--- a/bindings/python/tokenizers/implementations/byte_level_bpe.py
+++ b/bindings/python/tokenizers/implementations/byte_level_bpe.py
@@ -45,4 +45,6 @@ class ByteLevelBPETokenizer(BaseTokenizer):
             special_tokens=special_tokens,
             initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)
diff --git a/bindings/python/tokenizers/implementations/sentencepiece_bpe.py b/bindings/python/tokenizers/implementations/sentencepiece_bpe.py
index 65c26bd7..733024b5 100644
--- a/bindings/python/tokenizers/implementations/sentencepiece_bpe.py
+++ b/bindings/python/tokenizers/implementations/sentencepiece_bpe.py
@@ -59,4 +59,6 @@ class SentencePieceBPETokenizer(BaseTokenizer):
             initial_alphabet=initial_alphabet,
             show_progress=show_progress
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)
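
A minimal sketch of what the `files` normalization above means at the call site, assuming the bindings as of this diff and a local corpus file named data.txt (the filename is hypothetical). The same normalization is applied in BertWordPieceTokenizer, BPETokenizer, ByteLevelBPETokenizer, and SentencePieceBPETokenizer:

    from tokenizers import ByteLevelBPETokenizer

    tokenizer = ByteLevelBPETokenizer()

    # `train` now wraps a bare string into a one-element list before
    # forwarding it to self._tokenizer.train(trainer, files), so both
    # of these calls are equivalent:
    tokenizer.train("data.txt")
    tokenizer.train(["data.txt"])

    # The BaseTokenizer.__repr__ change means printing the tokenizer now
    # renders its parameters as name=value pairs instead of name: value.
    print(tokenizer)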