mirror of https://github.com/mii443/tokenizers.git
make tests happy
@@ -89,6 +89,7 @@ from .tokenizers import (
     pre_tokenizers,
     processors,
     trainers,
     __version__,
 )
 from .implementations import (
     BertWordPieceTokenizer,
@@ -75,7 +75,7 @@ impl PyAddedToken {
             single_word: None,
             lstrip: None,
             rstrip: None,
-            normalized: None,
+            normalized: Some(!special.unwrap_or(true)),
         }
     }
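Restated as a Python sketch (illustrative only, not part of the commit): `normalized` now defaults to the negation of `special` instead of being left unset, and `unwrap_or(true)` means an unspecified flag is treated as special on this code path.

    # Hypothetical restatement of `Some(!special.unwrap_or(true))` in Python.
    def default_normalized(special=None):
        # unwrap_or(true): a missing flag counts as special here.
        effective_special = special if special is not None else True
        return not effective_special

    assert default_normalized(True) is False   # special tokens: not normalized
    assert default_normalized(False) is True   # ordinary tokens: normalized
    assert default_normalized() is False       # unspecified: treated as special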
@@ -226,7 +226,7 @@ impl PyBpeTrainer {
                 if let Ok(content) = token.extract::<String>() {
                     Ok(tk::tokenizer::AddedToken::from(content, true))
                 } else if let Ok(mut token) = token.extract::<PyRefMut<PyAddedToken>>() {
-                    token.special = false;
+                    token.special = true;
                     Ok(token.get_token())
                 } else {
                     Err(exceptions::PyTypeError::new_err(
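The upshot for the Python API, as a hedged sketch (the snippet is illustrative and assumes a build of these bindings): every entry of a trainer's `special_tokens` is now marked special, whether it arrives as a plain string or as an `AddedToken`.

    from tokenizers import AddedToken
    from tokenizers.trainers import BpeTrainer

    # Strings become AddedToken(..., special=True); explicit AddedTokens
    # are force-flagged special=True by the conversion above.
    trainer = BpeTrainer(special_tokens=["[CLS]", AddedToken("[SEP]")])
    assert all(tok.special for tok in trainer.special_tokens)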
@@ -24,7 +24,7 @@ class TestAddedToken:
         assert added_token.special == False
         added_token.special = True
         assert added_token.special == True

         added_token.special = False
         assert str(added_token) == "<mask>"
         assert (
             repr(added_token)
@@ -34,8 +34,8 @@ class TestBpeTrainer:
         assert trainer.min_frequency == 12
         assert trainer.show_progress == False
         assert trainer.special_tokens == [
-            AddedToken("1"),
-            AddedToken("2"),
+            AddedToken("1", special = True),
+            AddedToken("2", special = True),
         ]
         assert trainer.limit_alphabet == 13
         assert sorted(trainer.initial_alphabet) == ["a", "b", "c"]
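A condensed repro of the updated TestBpeTrainer expectation, with the values copied from the test (the snippet itself is not in the commit):

    from tokenizers import AddedToken
    from tokenizers.trainers import BpeTrainer

    trainer = BpeTrainer(
        min_frequency=12,
        show_progress=False,
        special_tokens=["1", "2"],
        limit_alphabet=13,
        initial_alphabet=["a", "b", "c"],
    )
    # Plain strings now round-trip as special AddedTokens.
    assert trainer.special_tokens == [
        AddedToken("1", special=True),
        AddedToken("2", special=True),
    ]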
@@ -91,8 +91,8 @@ class TestWordPieceTrainer:
         assert trainer.min_frequency == 12
         assert trainer.show_progress == False
         assert trainer.special_tokens == [
-            AddedToken("1"),
-            AddedToken("2"),
+            AddedToken("1", special = True),
+            AddedToken("2", special = True),
         ]
         assert trainer.limit_alphabet == 13
         assert sorted(trainer.initial_alphabet) == ["a", "b", "c"]
@@ -131,8 +131,8 @@ class TestWordLevelTrainer:
         assert trainer.min_frequency == 12
         assert trainer.show_progress == False
         assert trainer.special_tokens == [
-            AddedToken("1"),
-            AddedToken("2"),
+            AddedToken("1", special = True),
+            AddedToken("2", special = True),
         ]

         # Modify these
@@ -272,8 +272,8 @@ class TestUnigram:
         assert trainer.vocab_size == 12345
         assert trainer.show_progress == False
         assert trainer.special_tokens == [
-            AddedToken("1", normalized=False),
-            AddedToken("2", lstrip=True, normalized=False),
+            AddedToken("1", normalized=False, special = True),
+            AddedToken("2", lstrip=True, normalized=False, special = True),
         ]
         assert sorted(trainer.initial_alphabet) == ["a", "b", "c"]
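The Unigram case shows that explicitly passed flags survive the conversion; only `special` is forced on. A sketch mirroring the test above (again illustrative, not part of the commit):

    from tokenizers import AddedToken
    from tokenizers.trainers import UnigramTrainer

    trainer = UnigramTrainer(
        vocab_size=12345,
        show_progress=False,
        special_tokens=[
            AddedToken("1", normalized=False),
            AddedToken("2", lstrip=True, normalized=False),
        ],
        initial_alphabet=["a", "b", "c"],
    )
    # lstrip/normalized pass through untouched; special is set to True.
    assert trainer.special_tokens == [
        AddedToken("1", normalized=False, special=True),
        AddedToken("2", lstrip=True, normalized=False, special=True),
    ]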