Merge pull request #48 from huggingface/fix-python-stuff
Fix a few python stuff
@@ -109,7 +109,7 @@ impl Encoding {
 impl PyObjectProtocol for Encoding {
     fn __repr__(&self) -> PyResult<String> {
         Ok(format!(
-            "Encoding(num_tokens={}, attributs=[ids, type_ids, tokens, offsets, \
+            "Encoding(num_tokens={}, attributes=[ids, type_ids, tokens, offsets, \
             attention_mask, special_tokens_mask, overflowing, original_str, normalized_str])",
             self.encoding.get_ids().len()
         ))
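The hunk above fixes the spelling of "attributes" in the Rust-side __repr__ that the Python bindings expose for Encoding. For context, a minimal sketch of where this string surfaces, assuming the Python bindings of this era and a hypothetical local vocab.txt (the names and the token count in the comment are illustrative only):

    # Sketch only: assumes the tokenizers Python bindings and a hypothetical vocab.txt.
    from tokenizers import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer("vocab.txt")
    encoding = tokenizer.encode("Hello, world!")

    # Printing the Encoding goes through the __repr__ shown above, e.g.:
    # Encoding(num_tokens=6, attributes=[ids, type_ids, tokens, offsets,
    #     attention_mask, special_tokens_mask, overflowing, original_str, normalized_str])
    print(encoding)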
@@ -12,7 +12,7 @@ class BaseTokenizer:
     def __repr__(self):
         return "Tokenizer(vocabulary_size={}, {})".format(
             self._tokenizer.get_vocab_size(),
-            ', '.join(k + ': ' + str(v) for k, v in self._parameters.items()))
+            ', '.join(k + '=' + str(v) for k, v in self._parameters.items()))
 
     def enable_padding(self,
                        direction: Optional[str] = "right",
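The BaseTokenizer.__repr__ change above switches the parameter listing from "key: value" to "key=value". A self-contained illustration of the resulting format, using made-up parameter values rather than a real tokenizer:

    # Standalone illustration of the formatting above; the parameter values are made up.
    _parameters = {"model": "BertWordPiece", "add_special_tokens": True, "lowercase": True}
    vocab_size = 30522

    print("Tokenizer(vocabulary_size={}, {})".format(
        vocab_size,
        ', '.join(k + '=' + str(v) for k, v in _parameters.items())))
    # -> Tokenizer(vocabulary_size=30522, model=BertWordPiece, add_special_tokens=True, lowercase=True)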
@@ -33,14 +33,14 @@ class BertWordPieceTokenizer(BaseTokenizer):
                                                   lowercase=lowercase)
         tokenizer.pre_tokenizer = BertPreTokenizer.new()
 
-        sep_token_id = tokenizer.token_to_id(sep_token)
-        if sep_token_id is None:
-            raise TypeError("sep_token not found in the vocabulary")
-        cls_token_id = tokenizer.token_to_id(cls_token)
-        if cls_token_id is None:
-            raise TypeError("cls_token not found in the vocabulary")
-
-        if add_special_tokens:
+        if add_special_tokens and vocab_file is not None:
+            sep_token_id = tokenizer.token_to_id(sep_token)
+            if sep_token_id is None:
+                raise TypeError("sep_token not found in the vocabulary")
+            cls_token_id = tokenizer.token_to_id(cls_token)
+            if cls_token_id is None:
+                raise TypeError("cls_token not found in the vocabulary")
+
             tokenizer.post_processor = BertProcessing.new(
                 (sep_token, sep_token_id),
                 (cls_token, cls_token_id)
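With the change above, BertWordPieceTokenizer only looks up the sep_token/cls_token ids and attaches the BertProcessing post-processor when special tokens are requested and a vocab_file was actually provided; without a vocabulary, token_to_id has nothing to resolve and the old path could raise TypeError even though the tokenizer was about to be trained from scratch. A hedged sketch of the intended usage, assuming the Python bindings of this era:

    # Sketch, assuming the Python bindings of this era; "corpus.txt" is a hypothetical file.
    from tokenizers import BertWordPieceTokenizer

    # No vocab_file: the sep/cls id lookup and post-processor setup are now skipped
    # instead of tripping over a missing vocabulary.
    tokenizer = BertWordPieceTokenizer()
    tokenizer.train("corpus.txt")  # a single path also works; see the files handling below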
@@ -81,4 +81,6 @@ class BertWordPieceTokenizer(BaseTokenizer):
             show_progress=show_progress,
             continuing_subword_prefix=wordpieces_prefix
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)
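This hunk, and the three that follow for BPETokenizer, ByteLevelBPETokenizer, and SentencePieceBPETokenizer, add the same normalization: a single training file passed as a plain string is wrapped in a list before being handed to the trainer. A standalone illustration of the pattern (the helper name exists only for this example):

    # Standalone illustration of the files normalization added in these hunks.
    from typing import List, Union

    def normalize_files(files: Union[str, List[str]]) -> List[str]:
        # Accept either a single path or a list of paths; always return a list
        # so the training call can iterate uniformly.
        if isinstance(files, str):
            files = [files]
        return files

    print(normalize_files("corpus.txt"))        # ['corpus.txt']
    print(normalize_files(["a.txt", "b.txt"]))  # ['a.txt', 'b.txt']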
@@ -61,4 +61,6 @@ class BPETokenizer(BaseTokenizer):
             end_of_word_suffix=suffix,
             show_progress=show_progress
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)
@@ -45,4 +45,6 @@ class ByteLevelBPETokenizer(BaseTokenizer):
             special_tokens=special_tokens,
             initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)
@@ -59,4 +59,6 @@ class SentencePieceBPETokenizer(BaseTokenizer):
             initial_alphabet=initial_alphabet,
             show_progress=show_progress
         )
+        if isinstance(files, str):
+            files = [files]
         self._tokenizer.train(trainer, files)