mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Fixing roberta type ids (everything is zero). (#1072)
* Fixing roberta type ids (everything is zero).
* We need to fix type_ids for all sequences even when not changing anything else.
* Fixing tests, hopefully better.
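RoBERTa was pretrained without a next-sentence objective and only ever uses segment 0, so its token type ids must be all zeros even for sequence pairs. A minimal sketch of the invariant this commit enforces (the helper name is illustrative, not part of the crate's API):

// Hypothetical helper: RoBERTa type ids are uniformly zero, for single
// sequences and pairs alike.
fn roberta_type_ids(total_len: usize) -> Vec<u32> {
    vec![0; total_len]
}

fn main() {
    // "<s> Hello there </s> </s> pair </s>" spans 7 positions.
    assert_eq!(roberta_type_ids(7), vec![0; 7]);
}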
bindings/python/Cargo.lock (generated, 4 changes)
@@ -1706,7 +1706,7 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
 
 [[package]]
 name = "tokenizers"
-version = "0.13.0"
+version = "0.13.1"
 dependencies = [
  "aho-corasick",
  "cached-path",
@@ -1739,7 +1739,7 @@ dependencies = [
 
 [[package]]
 name = "tokenizers-python"
-version = "0.13.0"
+version = "0.13.1"
 dependencies = [
  "env_logger",
  "itertools 0.9.0",
bindings/python/Cargo.toml

@@ -14,7 +14,7 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.7.1"
-pyo3 = { version = "0.16.2", features = ["extension-module"] }
+pyo3 = { version = "0.16.2" }
 numpy = "0.16.2"
 ndarray = "0.13"
 onig = { version = "6.0", default-features = false }
@@ -28,5 +28,6 @@ path = "../../tokenizers"
 tempfile = "3.1"
 
 [features]
+default = ["pyo3/extension-module"]
 test = ["pyo3/auto-initialize"]
 
bindings/python/Makefile

@@ -20,7 +20,7 @@ TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
 test: $(TESTS_RESOURCES)
 	pip install pytest requests setuptools_rust numpy pyarrow datasets
 	python -m pytest -s -v tests
-	cargo test --features test
+	cargo test --no-default-features --features test
 
 $(DATA_DIR)/big.txt :
 	$(dir_guard)
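The Makefile change follows from the feature move above: with pyo3's extension-module feature now enabled by default, a plain `cargo test` would build test binaries that do not link against libpython and fail at link time, so tests run with --no-default-features --features test, picking up pyo3's auto-initialize instead. This reading of the motivation is inferred from the diff; the commit message does not spell it out.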
tokenizers/src/processors/roberta.rs

@@ -70,6 +70,11 @@ impl PostProcessor for RobertaProcessing {
             }
         }
 
+        // Roberta is weird, and every encoding is type_id=0.
+        encodings
+            .iter_mut()
+            .for_each(|encoding| encoding.set_type_ids(vec![0; encoding.len()]));
+
         if !add_special_tokens {
             return Ok(encodings);
         }
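For readers without the surrounding file, here is a self-contained sketch of that zeroing pass, using a simplified stand-in for the crate's Encoding type (len and set_type_ids mirror methods on the real type; the struct itself is illustrative only):

struct Encoding {
    ids: Vec<u32>,
    type_ids: Vec<u32>,
}

impl Encoding {
    fn len(&self) -> usize {
        self.ids.len()
    }
    fn set_type_ids(&mut self, type_ids: Vec<u32>) {
        self.type_ids = type_ids;
    }
}

fn main() {
    let mut encodings = vec![
        Encoding { ids: vec![12, 14], type_ids: vec![0, 0] },
        Encoding { ids: vec![15], type_ids: vec![1] }, // stale pair type id
    ];
    // Same shape as the patch: force type_id = 0 everywhere, even when no
    // special tokens are added afterwards.
    encodings
        .iter_mut()
        .for_each(|encoding| encoding.set_type_ids(vec![0; encoding.len()]));
    assert!(encodings.iter().all(|e| e.type_ids.iter().all(|&t| t == 0)));
}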
@@ -110,7 +115,7 @@ impl PostProcessor for RobertaProcessing {
             .map(|encoding| {
                 let ids =
                     [&[self.cls.1], encoding.get_ids(), &[self.sep.1]].concat();
-                let type_ids = [&[0], encoding.get_type_ids(), &[0]].concat();
+                let type_ids = vec![0; encoding.get_ids().len() + 2];
                 let tokens = [
                     &[self.cls.0.clone()],
                     encoding.get_tokens(),
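The one-line swap here is subtle: the old concat spliced the encoding's existing type ids between two zeros, so any stray 1 survived, while vec![0; len + 2] rebuilds the vector outright. A small standalone demonstration of the difference:

fn main() {
    let existing: &[u32] = &[0, 1]; // an encoding that arrived with a stray 1
    let old = [&[0][..], existing, &[0][..]].concat();
    let new = vec![0u32; existing.len() + 2];
    assert_eq!(old, vec![0, 0, 1, 0]); // the stray 1 leaks through
    assert_eq!(new, vec![0, 0, 0, 0]); // all zeros, as RoBERTa expects
}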
@@ -146,7 +151,7 @@ impl PostProcessor for RobertaProcessing {
                 )
             } else {
                 let pair_ids = [&[self.sep.1], encoding.get_ids(), &[self.sep.1]].concat();
-                let pair_type_ids = vec![1; encoding.get_ids().len() + 2];
+                let pair_type_ids = vec![0; encoding.get_ids().len() + 2];
                 let pair_tokens = [
                     &[self.sep.0.clone()],
                     encoding.get_tokens(),
@@ -176,7 +181,7 @@ impl PostProcessor for RobertaProcessing {
             .map(|encoding| {
                 let pair_ids =
                     [&[self.sep.1], encoding.get_ids(), &[self.sep.1]].concat();
-                let pair_type_ids = vec![1; encoding.get_ids().len() + 2];
+                let pair_type_ids = vec![0; encoding.get_ids().len() + 2];
                 let pair_tokens = [
                     &[self.sep.0.clone()],
                     encoding.get_tokens(),
@@ -280,7 +285,7 @@ mod tests {
             pair_encoding,
             Encoding::new(
                 vec![0, 12, 14, 2, 2, 15, 2],
-                vec![0, 0, 0, 0, 1, 1, 1],
+                vec![0, 0, 0, 0, 0, 0, 0],
                 vec![
                     "<s>".into(),
                     "Hello".into(),
@@ -310,7 +315,7 @@ mod tests {
             pair_encoding,
             Encoding::new(
                 vec![12, 14, 15],
-                vec![0, 0, 1],
+                vec![0, 0, 0],
                 vec!["Hello".into(), "there".into(), "pair".into(),],
                 vec![None, None, None],
                 vec![(0, 5), (6, 11), (0, 4)],
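Both test updates encode the same expectation: ids, tokens, and offsets of the pair encoding are untouched, and only the type-id vector flattens to zeros. A quick standalone check in the same spirit, with values copied from the first test above:

fn main() {
    // Pair encoding "<s> Hello there </s> </s> pair </s>" from the test.
    let ids = vec![0, 12, 14, 2, 2, 15, 2];
    // Previously [0, 0, 0, 0, 1, 1, 1]; after the fix, uniformly zero.
    let type_ids = vec![0u32; ids.len()];
    assert_eq!(type_ids, vec![0, 0, 0, 0, 0, 0, 0]);
}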