Merge pull request #48 from huggingface/fix-python-stuff

Fix a few Python things
MOI Anthony authored 2020-01-10 10:03:02 -05:00, committed by GitHub
6 changed files with 17 additions and 9 deletions

View File

@@ -109,7 +109,7 @@ impl Encoding {
 impl PyObjectProtocol for Encoding {
     fn __repr__(&self) -> PyResult<String> {
         Ok(format!(
-            "Encoding(num_tokens={}, attributs=[ids, type_ids, tokens, offsets, \
+            "Encoding(num_tokens={}, attributes=[ids, type_ids, tokens, offsets, \
             attention_mask, special_tokens_mask, overflowing, original_str, normalized_str])",
             self.encoding.get_ids().len()
         ))

View File

@@ -12,7 +12,7 @@ class BaseTokenizer:
     def __repr__(self):
         return "Tokenizer(vocabulary_size={}, {})".format(
             self._tokenizer.get_vocab_size(),
-            ', '.join(k + ': ' + str(v) for k, v in self._parameters.items()))
+            ', '.join(k + '=' + str(v) for k, v in self._parameters.items()))

     def enable_padding(self,
                        direction: Optional[str] = "right",
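
The practical effect of that one-character change, as a small standalone sketch (the dict stands in for BaseTokenizer._parameters; its keys and values are illustrative, not the library's actual contents):

    # Stand-in for BaseTokenizer._parameters; contents are illustrative.
    params = {"model": "BPE", "unk_token": "<unk>", "dropout": None}

    old_style = ', '.join(k + ': ' + str(v) for k, v in params.items())
    new_style = ', '.join(k + '=' + str(v) for k, v in params.items())

    print(old_style)  # model: BPE, unk_token: <unk>, dropout: None
    print(new_style)  # model=BPE, unk_token=<unk>, dropout=None

The key=value form mirrors how the parameters would actually be typed as keyword arguments.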

View File

@@ -33,14 +33,14 @@ class BertWordPieceTokenizer(BaseTokenizer):
                                            lowercase=lowercase)
         tokenizer.pre_tokenizer = BertPreTokenizer.new()

-        sep_token_id = tokenizer.token_to_id(sep_token)
-        if sep_token_id is None:
-            raise TypeError("sep_token not found in the vocabulary")
-        cls_token_id = tokenizer.token_to_id(cls_token)
-        if cls_token_id is None:
-            raise TypeError("cls_token not found in the vocabulary")
+        if add_special_tokens and vocab_file is not None:
+            sep_token_id = tokenizer.token_to_id(sep_token)
+            if sep_token_id is None:
+                raise TypeError("sep_token not found in the vocabulary")
+            cls_token_id = tokenizer.token_to_id(cls_token)
+            if cls_token_id is None:
+                raise TypeError("cls_token not found in the vocabulary")

-        if add_special_tokens:
             tokenizer.post_processor = BertProcessing.new(
                 (sep_token, sep_token_id),
                 (cls_token, cls_token_id)
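
Before this hunk, the token_to_id lookups ran unconditionally, so constructing the tokenizer without a vocab_file raised TypeError even when no special-token post-processing was wanted. Restated outside the class as a sketch (the free function and its signature are ours for illustration, not the library's API):

    # Sketch only: mirrors the guarded lookups from the diff above.
    def resolve_special_token_ids(tokenizer, add_special_tokens, vocab_file,
                                  sep_token="[SEP]", cls_token="[CLS]"):
        if add_special_tokens and vocab_file is not None:
            sep_token_id = tokenizer.token_to_id(sep_token)
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(cls_token)
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")
            # In the real code, BertProcessing.new((sep_token, sep_token_id),
            # (cls_token, cls_token_id)) is attached as the post-processor here.
            return (sep_token_id, cls_token_id)
        return None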
@@ -81,4 +81,6 @@ class BertWordPieceTokenizer(BaseTokenizer):
             show_progress=show_progress,
             continuing_subword_prefix=wordpieces_prefix
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)
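
This file and the three below receive the identical two-line normalization, so each train method accepts either a single path or a list of paths. A minimal standalone sketch of the pattern (the helper name is ours for illustration):

    from typing import List, Union

    def normalize_files(files: Union[str, List[str]]) -> List[str]:
        # A lone path is wrapped into a one-element list so the
        # downstream training call can always iterate over a list.
        if isinstance(files, str):
            files = [files]
        return files

    assert normalize_files("big.txt") == ["big.txt"]
    assert normalize_files(["a.txt", "b.txt"]) == ["a.txt", "b.txt"]

After the change, both tokenizer.train("big.txt") and tokenizer.train(["train.txt", "valid.txt"]) work.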

View File

@@ -61,4 +61,6 @@ class BPETokenizer(BaseTokenizer):
             end_of_word_suffix=suffix,
             show_progress=show_progress
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)

View File

@@ -45,4 +45,6 @@ class ByteLevelBPETokenizer(BaseTokenizer):
             special_tokens=special_tokens,
             initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)

View File

@@ -59,4 +59,6 @@ class SentencePieceBPETokenizer(BaseTokenizer):
             initial_alphabet=initial_alphabet,
             show_progress=show_progress
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)