Merge pull request #48 from huggingface/fix-python-stuff
Fix a few python stuff
@@ -109,7 +109,7 @@ impl Encoding {
 impl PyObjectProtocol for Encoding {
     fn __repr__(&self) -> PyResult<String> {
         Ok(format!(
-            "Encoding(num_tokens={}, attributs=[ids, type_ids, tokens, offsets, \
+            "Encoding(num_tokens={}, attributes=[ids, type_ids, tokens, offsets, \
             attention_mask, special_tokens_mask, overflowing, original_str, normalized_str])",
             self.encoding.get_ids().len()
         ))

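For reference, the corrected attribute name is what a Python user sees when printing an Encoding. A minimal sketch of that output, assuming the 0.x Python API from the time of this PR; the vocab path and input text are placeholders, not part of the change:

    # Sketch only: "vocab.txt" is a placeholder WordPiece vocabulary file.
    from tokenizers import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer("vocab.txt")
    encoding = tokenizer.encode("Hello, world!")
    print(repr(encoding))
    # Encoding(num_tokens=..., attributes=[ids, type_ids, tokens, offsets,
    #     attention_mask, special_tokens_mask, overflowing, original_str, normalized_str])
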
@@ -12,7 +12,7 @@ class BaseTokenizer:
     def __repr__(self):
         return "Tokenizer(vocabulary_size={}, {})".format(
             self._tokenizer.get_vocab_size(),
-            ', '.join(k + ': ' + str(v) for k, v in self._parameters.items()))
+            ', '.join(k + '=' + str(v) for k, v in self._parameters.items()))
 
     def enable_padding(self,
                        direction: Optional[str] = "right",

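The change above makes __repr__ render parameters as key=value pairs instead of key: value. A standalone sketch of the new formatting; the parameter dict here is illustrative, not the real _parameters of any tokenizer:

    # Illustrative values: a real tokenizer fills _parameters from its constructor arguments.
    params = {"model": "BertWordPiece", "add_special_tokens": True, "lowercase": True}
    print("Tokenizer(vocabulary_size={}, {})".format(
        30522,
        ', '.join(k + '=' + str(v) for k, v in params.items())))
    # Tokenizer(vocabulary_size=30522, model=BertWordPiece, add_special_tokens=True, lowercase=True)
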
@@ -33,14 +33,14 @@ class BertWordPieceTokenizer(BaseTokenizer):
                                                   lowercase=lowercase)
         tokenizer.pre_tokenizer = BertPreTokenizer.new()
 
-        sep_token_id = tokenizer.token_to_id(sep_token)
-        if sep_token_id is None:
-            raise TypeError("sep_token not found in the vocabulary")
-        cls_token_id = tokenizer.token_to_id(cls_token)
-        if cls_token_id is None:
-            raise TypeError("cls_token not found in the vocabulary")
+        if add_special_tokens and vocab_file is not None:
+            sep_token_id = tokenizer.token_to_id(sep_token)
+            if sep_token_id is None:
+                raise TypeError("sep_token not found in the vocabulary")
+            cls_token_id = tokenizer.token_to_id(cls_token)
+            if cls_token_id is None:
+                raise TypeError("cls_token not found in the vocabulary")
 
-        if add_special_tokens:
             tokenizer.post_processor = BertProcessing.new(
                 (sep_token, sep_token_id),
                 (cls_token, cls_token_id)

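The net effect of the hunk above is that the [SEP]/[CLS] lookups and the BertProcessing post-processor are only set up when special tokens are requested and a vocabulary is actually available, so a BertWordPieceTokenizer can be created without a vocab file (for example, before training one). A minimal sketch, assuming the 0.x Python API of the time and that the constructor accepts vocab_file=None (which the new guard implies):

    # Sketch only: before this change, constructing without a vocabulary would raise
    # TypeError("sep_token not found in the vocabulary") from the unguarded lookup.
    from tokenizers import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer()               # no vocab_file: lookups are skipped
    # tokenizer = BertWordPieceTokenizer("vocab.txt")  # with a vocab, the checks and
    #                                                  # post-processor still apply
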
@@ -81,4 +81,6 @@ class BertWordPieceTokenizer(BaseTokenizer):
             show_progress=show_progress,
             continuing_subword_prefix=wordpieces_prefix
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)

@@ -61,4 +61,6 @@ class BPETokenizer(BaseTokenizer):
             end_of_word_suffix=suffix,
             show_progress=show_progress
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)

@@ -45,4 +45,6 @@ class ByteLevelBPETokenizer(BaseTokenizer):
             special_tokens=special_tokens,
             initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)

@@ -59,4 +59,6 @@ class SentencePieceBPETokenizer(BaseTokenizer):
             initial_alphabet=initial_alphabet,
             show_progress=show_progress
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)

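Each of the four train() hunks above adds the same two-line normalization, so a single file path can be passed directly instead of a one-element list. A minimal usage sketch, assuming the 0.x Python API of the time; the corpus path is a placeholder:

    # Sketch only: "corpus.txt" is a placeholder training corpus, not part of this PR.
    from tokenizers import ByteLevelBPETokenizer

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train("corpus.txt")       # a bare string is now wrapped into ["corpus.txt"]
    # tokenizer.train(["corpus.txt"])   # passing a list keeps working as before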