Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
Preparing rc1 release. (#1056)
* Preparing rc1 release.
* Fixing test_alignment_methods
* Fixing the overflowing sequence_id issue (LayoutLMv2 tests caught this).
* Adding overly complex overflowing test.
bindings/node/CHANGELOG.md
@@ -1,3 +1,8 @@
+## [0.13.0]
+
+- [#1008] `Decoder` is now a composable trait, but without being backward incompatible
+- [#1047, #1051, #1052] `Processor` is now a composable trait, but without being backward incompatible
+
 ## [0.12.1]
 
 - [#938] **Reverted breaking change**. https://github.com/huggingface/transformers/issues/16520
@@ -160,6 +165,13 @@ The files must now be provided first when calling `tokenizer.train(files, trainer)`
 - Actually add special tokens in tokenizers implementations ([acef252](https://github.com/huggingface/tokenizers/commit/acef252dacc43adc414175cfc325668ad1488753))
 
 
+[#956]: https://github.com/huggingface/tokenizers/pull/956
+[#1008]: https://github.com/huggingface/tokenizers/pull/1008
+[#1009]: https://github.com/huggingface/tokenizers/pull/1009
+[#1047]: https://github.com/huggingface/tokenizers/pull/1047
+[#1055]: https://github.com/huggingface/tokenizers/pull/1055
+[#1051]: https://github.com/huggingface/tokenizers/pull/1051
+[#1052]: https://github.com/huggingface/tokenizers/pull/1052
 [#938]: https://github.com/huggingface/tokenizers/pull/938
 [#939]: https://github.com/huggingface/tokenizers/pull/939
 [#952]: https://github.com/huggingface/tokenizers/pull/952
bindings/node/package.json
@@ -1,6 +1,6 @@
 {
   "name": "tokenizers",
-  "version": "0.12.1",
+  "version": "0.13.0",
   "description": "",
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",
bindings/python/CHANGELOG.md
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.13.0]
+
+- [#956] PyO3 version upgrade
+- [#1055] M1 automated builds
+- [#1008] `Decoder` is now a composable trait, but without being backward incompatible
+- [#1047, #1051, #1052] `Processor` is now a composable trait, but without being backward incompatible
+
+Both trait changes warrant a "major" number since, despite best efforts to not break backward
+compatibility, the code is different enough that we cannot be exactly sure.
+
 ## [0.12.1]
 
 - [#938] **Reverted breaking change**. https://github.com/huggingface/transformers/issues/16520
@@ -376,6 +386,13 @@ delimiter (Works like `.split(delimiter)`)
 - Fix a bug that was causing crashes in Python 3.5
 
 
+[#956]: https://github.com/huggingface/tokenizers/pull/956
+[#1008]: https://github.com/huggingface/tokenizers/pull/1008
+[#1009]: https://github.com/huggingface/tokenizers/pull/1009
+[#1047]: https://github.com/huggingface/tokenizers/pull/1047
+[#1055]: https://github.com/huggingface/tokenizers/pull/1055
+[#1051]: https://github.com/huggingface/tokenizers/pull/1051
+[#1052]: https://github.com/huggingface/tokenizers/pull/1052
 [#938]: https://github.com/huggingface/tokenizers/pull/938
 [#939]: https://github.com/huggingface/tokenizers/pull/939
 [#952]: https://github.com/huggingface/tokenizers/pull/952
bindings/python/Cargo.lock (generated)
@@ -1700,7 +1700,7 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
 
 [[package]]
 name = "tokenizers"
-version = "0.12.1"
+version = "0.13.0"
 dependencies = [
  "aho-corasick",
  "cached-path",
bindings/python/py_src/tokenizers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.12.1.dev0"
+__version__ = "0.13.0.dev0"
 
 from typing import Tuple, Union, Tuple, List
 from enum import Enum
bindings/python/setup.py
@@ -8,7 +8,7 @@ extras["dev"] = extras["testing"]
 
 setup(
     name="tokenizers",
-    version="0.12.1.dev0",
+    version="0.13.0.dev0",
     description="Fast and Customizable Tokenizers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
tokenizers/CHANGELOG.md
@@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.13.0]
+
+- [#1009] `unstable_wasm` feature to support building on Wasm (it's unstable !)
+- [#1008] `Decoder` is now a composable trait, but without being backward incompatible
+- [#1047, #1051, #1052] `Processor` is now a composable trait, but without being backward incompatible
+
+Both trait changes warrant a "major" number since, despite best efforts to not break backward
+compatibility, the code is different enough that we cannot be exactly sure.
+
 ## [0.12.1]
 
 - [#938] **Reverted breaking change**. https://github.com/huggingface/transformers/issues/16520
@@ -157,6 +166,13 @@ split up in multiple bytes
 - [#174]: The `LongestFirst` truncation strategy had a bug
 
 
+[#956]: https://github.com/huggingface/tokenizers/pull/956
+[#1008]: https://github.com/huggingface/tokenizers/pull/1008
+[#1009]: https://github.com/huggingface/tokenizers/pull/1009
+[#1047]: https://github.com/huggingface/tokenizers/pull/1047
+[#1055]: https://github.com/huggingface/tokenizers/pull/1055
+[#1051]: https://github.com/huggingface/tokenizers/pull/1051
+[#1052]: https://github.com/huggingface/tokenizers/pull/1052
 [#938]: https://github.com/huggingface/tokenizers/pull/938
 [#939]: https://github.com/huggingface/tokenizers/pull/939
 [#952]: https://github.com/huggingface/tokenizers/pull/952
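For context on the "composable trait" changelog entries above: post-processors can now be chained with the `Sequence` wrapper (introduced by #1052 and exercised by the `process_chain` test further down in this diff). A minimal sketch of the idea, assuming the 0.13.0 module layout; attaching the chain to a tokenizer is only indicated in a comment:

use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::processors::sequence::Sequence;
use tokenizers::processors::PostProcessorWrapper;

fn main() {
    // Wrap any number of post-processors; they run in order over the
    // encodings during post-processing.
    let bytelevel = ByteLevel::default().trim_offsets(true);
    let chain = Sequence::new(vec![PostProcessorWrapper::ByteLevel(bytelevel)]);
    // `chain` implements `PostProcessor` like any single processor, so it
    // can be set as a tokenizer's post-processor.
    let _ = chain;
}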
tokenizers/Cargo.toml
@@ -2,7 +2,7 @@
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2018"
 name = "tokenizers"
-version = "0.12.1"
+version = "0.13.0"
 homepage = "https://github.com/huggingface/tokenizers"
 repository = "https://github.com/huggingface/tokenizers"
 documentation = "https://docs.rs/tokenizers/"
tokenizers/src/pre_tokenizers/byte_level.rs
@@ -484,7 +484,7 @@ mod tests {
         );
         let expected = Encoding::new(
             vec![0; 5],
-            vec![],
+            vec![0; 5],
             vec![
                 "Ġ".into(),
                 "ĠĠĠĠHelloĠĠ".into(),
@@ -508,7 +508,7 @@ mod tests {
 
         let pair_expected = Encoding::new(
             vec![0; 10],
-            vec![],
+            vec![0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
             vec![
                 "Ġ".into(),
                 "ĠĠĠĠHelloĠĠ".into(),
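These two test tweaks follow from the `PostProcessor::default_process` change at the bottom of this diff: type ids are now stamped with each sequence's index before merging, so the expected encodings carry real type ids (0 for the first sequence, 1 for the second) instead of an empty vector.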
tokenizers/src/processors/bert.rs
@@ -195,4 +195,91 @@ mod tests {
             bert
         );
     }
+
+    #[test]
+    fn bert_processing() {
+        let processor = BertProcessing::default();
+        assert_eq!(processor.added_tokens(false), 2);
+        assert_eq!(processor.added_tokens(true), 3);
+
+        use crate::Token;
+        let encoding = Encoding::from_tokens(
+            vec![
+                Token::new(12, "Hello".into(), (0, 5)),
+                Token::new(14, "there".into(), (6, 11)),
+            ],
+            0,
+        );
+        let pair = Encoding::from_tokens(vec![Token::new(15, "pair".into(), (0, 4))], 0);
+        let single_encoding = processor.process(encoding.clone(), None, true).unwrap();
+        assert_eq!(
+            single_encoding,
+            Encoding::new(
+                vec![101, 12, 14, 102],
+                vec![0, 0, 0, 0],
+                vec![
+                    "[CLS]".into(),
+                    "Hello".into(),
+                    "there".into(),
+                    "[SEP]".into()
+                ],
+                vec![None, None, None, None],
+                vec![(0, 0), (0, 5), (6, 11), (0, 0)],
+                vec![1, 0, 0, 1],
+                vec![1, 1, 1, 1],
+                vec![],
+                HashMap::from_iter(vec![(0, 1..3)]),
+            )
+        );
+        assert_eq!(single_encoding.token_to_sequence(2), Some(0));
+        assert_eq!(single_encoding.token_to_sequence(3), None);
+        let pair_encoding = processor
+            .process(encoding.clone(), Some(pair.clone()), true)
+            .unwrap();
+        assert_eq!(
+            pair_encoding,
+            Encoding::new(
+                vec![101, 12, 14, 102, 15, 102],
+                vec![0, 0, 0, 0, 1, 1],
+                vec![
+                    "[CLS]".into(),
+                    "Hello".into(),
+                    "there".into(),
+                    "[SEP]".into(),
+                    "pair".into(),
+                    "[SEP]".into()
+                ],
+                vec![None, None, None, None, None, None],
+                vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 4), (0, 0)],
+                vec![1, 0, 0, 1, 0, 1],
+                vec![1, 1, 1, 1, 1, 1],
+                vec![],
+                HashMap::from_iter(vec![(0, 1..3), (1, 4..5)]),
+            )
+        );
+        assert_eq!(pair_encoding.token_to_sequence(2), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(3), None);
+        assert_eq!(pair_encoding.token_to_sequence(4), Some(1));
+        assert_eq!(pair_encoding.token_to_sequence(5), None);
+
+        // No special tokens
+        let pair_encoding = processor.process(encoding, Some(pair), false).unwrap();
+        assert_eq!(
+            pair_encoding,
+            Encoding::new(
+                vec![12, 14, 15],
+                vec![0, 0, 1],
+                vec!["Hello".into(), "there".into(), "pair".into(),],
+                vec![None, None, None],
+                vec![(0, 5), (6, 11), (0, 4)],
+                vec![0, 0, 0],
+                vec![1, 1, 1],
+                vec![],
+                HashMap::from_iter(vec![(0, 0..2), (1, 2..3)]),
+            )
+        );
+        assert_eq!(pair_encoding.token_to_sequence(0), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(1), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(2), Some(1));
+    }
 }
tokenizers/src/processors/roberta.rs
@@ -146,7 +146,7 @@ impl PostProcessor for RobertaProcessing {
             )
         } else {
             let pair_ids = [&[self.sep.1], encoding.get_ids(), &[self.sep.1]].concat();
-            let pair_type_ids = vec![0; encoding.get_ids().len() + 2];
+            let pair_type_ids = vec![1; encoding.get_ids().len() + 2];
             let pair_tokens = [
                 &[self.sep.0.clone()],
                 encoding.get_tokens(),
@@ -176,7 +176,7 @@ impl PostProcessor for RobertaProcessing {
                 .map(|encoding| {
                     let pair_ids =
                         [&[self.sep.1], encoding.get_ids(), &[self.sep.1]].concat();
-                    let pair_type_ids = vec![0; encoding.get_ids().len() + 2];
+                    let pair_type_ids = vec![1; encoding.get_ids().len() + 2];
                     let pair_tokens = [
                         &[self.sep.0.clone()],
                         encoding.get_tokens(),
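The two one-line changes above apply the same fix to both branches of `RobertaProcessing`: the wrapped pair sequence now gets type id 1 rather than 0, and the new `roberta_processing` test below pins the resulting pair type ids to `vec![0, 0, 0, 0, 1, 1, 1]`.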
@@ -240,4 +240,88 @@ mod tests {
             roberta
         );
     }
+
+    #[test]
+    fn roberta_processing() {
+        let processor = RobertaProcessing::default();
+        assert_eq!(processor.added_tokens(false), 2);
+        assert_eq!(processor.added_tokens(true), 4);
+
+        use crate::Token;
+        let encoding = Encoding::from_tokens(
+            vec![
+                Token::new(12, "Hello".into(), (0, 5)),
+                Token::new(14, "there".into(), (6, 11)),
+            ],
+            0,
+        );
+        let pair = Encoding::from_tokens(vec![Token::new(15, "pair".into(), (0, 4))], 0);
+        let single_encoding = processor.process(encoding.clone(), None, true).unwrap();
+        assert_eq!(
+            single_encoding,
+            Encoding::new(
+                vec![0, 12, 14, 2],
+                vec![0, 0, 0, 0],
+                vec!["<s>".into(), "Hello".into(), "there".into(), "</s>".into()],
+                vec![None, None, None, None],
+                vec![(0, 0), (0, 5), (6, 11), (0, 0)],
+                vec![1, 0, 0, 1],
+                vec![1, 1, 1, 1],
+                vec![],
+                HashMap::from_iter(vec![(0, 1..3)]),
+            )
+        );
+        assert_eq!(single_encoding.token_to_sequence(2), Some(0));
+        assert_eq!(single_encoding.token_to_sequence(3), None);
+        let pair_encoding = processor
+            .process(encoding.clone(), Some(pair.clone()), true)
+            .unwrap();
+        assert_eq!(
+            pair_encoding,
+            Encoding::new(
+                vec![0, 12, 14, 2, 2, 15, 2],
+                vec![0, 0, 0, 0, 1, 1, 1],
+                vec![
+                    "<s>".into(),
+                    "Hello".into(),
+                    "there".into(),
+                    "</s>".into(),
+                    "</s>".into(),
+                    "pair".into(),
+                    "</s>".into()
+                ],
+                vec![None, None, None, None, None, None, None],
+                vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 0), (0, 4), (0, 0)],
+                vec![1, 0, 0, 1, 1, 0, 1],
+                vec![1, 1, 1, 1, 1, 1, 1],
+                vec![],
+                HashMap::from_iter(vec![(0, 1..3), (1, 5..6)]),
+            )
+        );
+        assert_eq!(pair_encoding.token_to_sequence(2), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(3), None);
+        assert_eq!(pair_encoding.token_to_sequence(4), None);
+        assert_eq!(pair_encoding.token_to_sequence(5), Some(1));
+        assert_eq!(pair_encoding.token_to_sequence(6), None);
+
+        // No special tokens
+        let pair_encoding = processor.process(encoding, Some(pair), false).unwrap();
+        assert_eq!(
+            pair_encoding,
+            Encoding::new(
+                vec![12, 14, 15],
+                vec![0, 0, 1],
+                vec!["Hello".into(), "there".into(), "pair".into(),],
+                vec![None, None, None],
+                vec![(0, 5), (6, 11), (0, 4)],
+                vec![0, 0, 0],
+                vec![1, 1, 1],
+                vec![],
+                HashMap::from_iter(vec![(0, 0..2), (1, 2..3)]),
+            )
+        );
+        assert_eq!(pair_encoding.token_to_sequence(0), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(1), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(2), Some(1));
+    }
 }
tokenizers/src/processors/sequence.rs
@@ -47,7 +47,7 @@ mod tests {
     fn process_chain() {
         let start = Encoding::new(
             vec![0; 5],
-            vec![],
+            vec![0; 5],
             vec![
                 "Ġ".into(),
                 "ĠĠĠĠHelloĠĠ".into(),
@@ -67,7 +67,7 @@ mod tests {
         let sequence = Sequence::new(vec![PostProcessorWrapper::ByteLevel(bytelevel)]);
         let expected = Encoding::new(
             vec![0; 5],
-            vec![],
+            vec![0; 5],
             vec![
                 "Ġ".into(),
                 "ĠĠĠĠHelloĠĠ".into(),
@@ -94,7 +94,7 @@ mod tests {
 
         let pair_expected = Encoding::new(
             vec![0; 10],
-            vec![],
+            vec![0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
             vec![
                 "Ġ".into(),
                 "ĠĠĠĠHelloĠĠ".into(),
tokenizers/src/processors/template.rs
@@ -885,6 +885,182 @@ mod tests {
         assert_eq!(pair_encoding.token_to_sequence(5), None);
     }
 
+    #[test]
+    fn template_processing_overflowing() {
+        let processor = tests::get_bert_template();
+        assert_eq!(processor.added_tokens(false), 2);
+        assert_eq!(processor.added_tokens(true), 3);
+
+        use crate::Token;
+        let mut encoding = Encoding::from_tokens(
+            vec![
+                Token::new(12, "Hello".into(), (0, 5)),
+                Token::new(14, "there".into(), (6, 11)),
+            ],
+            0,
+        );
+        let overflowing = Encoding::from_tokens(vec![Token::new(13, "you".into(), (12, 15))], 0);
+        encoding.set_overflowing(vec![overflowing]);
+
+        let mut pair = Encoding::from_tokens(
+            vec![
+                Token::new(15, "pair".into(), (0, 4)),
+                Token::new(16, "with".into(), (5, 9)),
+            ],
+            0,
+        );
+        let pair_overflowing =
+            Encoding::from_tokens(vec![Token::new(17, "info".into(), (10, 14))], 0);
+        pair.set_overflowing(vec![pair_overflowing]);
+
+        let single_encoding = processor.process(encoding.clone(), None, true).unwrap();
+        assert_eq!(
+            single_encoding,
+            Encoding::new(
+                vec![1, 12, 14, 0],
+                vec![0, 0, 0, 0],
+                vec![
+                    "[CLS]".into(),
+                    "Hello".into(),
+                    "there".into(),
+                    "[SEP]".into()
+                ],
+                vec![None, None, None, None],
+                vec![(0, 0), (0, 5), (6, 11), (0, 0)],
+                vec![1, 0, 0, 1],
+                vec![1, 1, 1, 1],
+                vec![Encoding::new(
+                    vec![1, 13, 0],
+                    vec![0, 0, 0],
+                    vec!["[CLS]".into(), "you".into(), "[SEP]".into()],
+                    vec![None, None, None],
+                    vec![(0, 0), (12, 15), (0, 0)],
+                    vec![1, 0, 1],
+                    vec![1, 1, 1],
+                    vec![],
+                    HashMap::from_iter(vec![(0, 1..2)]),
+                )],
+                HashMap::from_iter(vec![(0, 1..3)]),
+            )
+        );
+        assert_eq!(single_encoding.token_to_sequence(2), Some(0));
+        assert_eq!(single_encoding.token_to_sequence(3), None);
+        let pair_encoding = processor.process(encoding, Some(pair), true).unwrap();
+        println!("{pair_encoding:#?}");
+        assert_eq!(
+            pair_encoding,
+            Encoding::new(
+                vec![1, 12, 14, 0, 15, 16, 0],
+                vec![0, 0, 0, 0, 1, 1, 1],
+                vec![
+                    "[CLS]".into(),
+                    "Hello".into(),
+                    "there".into(),
+                    "[SEP]".into(),
+                    "pair".into(),
+                    "with".into(),
+                    "[SEP]".into()
+                ],
+                vec![None, None, None, None, None, None, None],
+                vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 4), (5, 9), (0, 0)],
+                vec![1, 0, 0, 1, 0, 0, 1],
+                vec![1, 1, 1, 1, 1, 1, 1],
+                vec![
+                    Encoding::new(
+                        vec![1, 13, 0, 15, 16, 0],
+                        vec![0, 0, 0, 1, 1, 1],
+                        vec![
+                            "[CLS]".into(),
+                            "you".into(),
+                            "[SEP]".into(),
+                            "pair".into(),
+                            "with".into(),
+                            "[SEP]".into()
+                        ],
+                        vec![None, None, None, None, None, None],
+                        vec![(0, 0), (12, 15), (0, 0), (0, 4), (5, 9), (0, 0)],
+                        vec![1, 0, 1, 0, 0, 1],
+                        vec![1, 1, 1, 1, 1, 1],
+                        vec![Encoding::new(
+                            vec![1, 13, 0, 17, 0],
+                            vec![0, 0, 0, 0, 1],
+                            vec![
+                                "[CLS]".into(),
+                                "you".into(),
+                                "[SEP]".into(),
+                                "info".into(),
+                                "[SEP]".into()
+                            ],
+                            vec![None, None, None, None, None,],
+                            vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)],
+                            vec![1, 0, 1, 0, 1],
+                            vec![1, 1, 1, 1, 1],
+                            vec![],
+                            HashMap::from_iter(vec![(0, 1..2), (1, 3..4)]),
+                        ),],
+                        HashMap::from_iter(vec![(1, 3..5), (0, 1..2)]),
+                    ),
+                    Encoding::new(
+                        vec![1, 13, 0, 17, 0],
+                        vec![0, 0, 0, 0, 1],
+                        vec![
+                            "[CLS]".into(),
+                            "you".into(),
+                            "[SEP]".into(),
+                            "info".into(),
+                            "[SEP]".into()
+                        ],
+                        vec![None, None, None, None, None,],
+                        vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)],
+                        vec![1, 0, 1, 0, 1],
+                        vec![1, 1, 1, 1, 1],
+                        vec![],
+                        HashMap::from_iter(vec![(0, 1..2), (1, 3..4)]),
+                    ),
+                    Encoding::new(
+                        vec![1, 12, 14, 0, 17, 0],
+                        vec![0, 0, 0, 0, 0, 1],
+                        vec![
+                            "[CLS]".into(),
+                            "Hello".into(),
+                            "there".into(),
+                            "[SEP]".into(),
+                            "info".into(),
+                            "[SEP]".into()
+                        ],
+                        vec![None, None, None, None, None, None],
+                        vec![(0, 0), (0, 5), (6, 11), (0, 0), (10, 14), (0, 0)],
+                        vec![1, 0, 0, 1, 0, 1],
+                        vec![1, 1, 1, 1, 1, 1],
+                        vec![Encoding::new(
+                            vec![1, 13, 0, 17, 0],
+                            vec![0, 0, 0, 0, 1],
+                            vec![
+                                "[CLS]".into(),
+                                "you".into(),
+                                "[SEP]".into(),
+                                "info".into(),
+                                "[SEP]".into()
+                            ],
+                            vec![None, None, None, None, None,],
+                            vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)],
+                            vec![1, 0, 1, 0, 1],
+                            vec![1, 1, 1, 1, 1],
+                            vec![],
+                            HashMap::from_iter(vec![(0, 1..2), (1, 3..4)]),
+                        ),],
+                        HashMap::from_iter(vec![(0, 1..3), (1, 4..5)]),
+                    )
+                ],
+                HashMap::from_iter(vec![(0, 1..3), (1, 4..6)]),
+            )
+        );
+        assert_eq!(pair_encoding.token_to_sequence(2), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(3), None);
+        assert_eq!(pair_encoding.token_to_sequence(4), Some(1));
+        assert_eq!(pair_encoding.token_to_sequence(5), Some(1));
+        assert_eq!(pair_encoding.token_to_sequence(6), None);
+    }
 
     #[test]
     fn pair_must_use_both_sequences() {
         let processor = TemplateProcessing::builder()
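This is the "overly complex overflowing test" from the commit message: with both sequences carrying an overflowing part, the processed pair must expose every combination (first overflow with the pair, first overflow with the pair's overflow, and the first sequence with the pair's overflow), and `token_to_sequence` must keep resolving correctly inside each nested overflowing encoding.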
tokenizers/src/tokenizer/encoding.rs
@@ -176,6 +176,10 @@ impl Encoding {
         &self.overflowing
     }
 
+    pub fn set_overflowing(&mut self, overflowing: Vec<Encoding>) {
+        self.overflowing = overflowing;
+    }
+
     pub fn get_overflowing_mut(&mut self) -> &mut Vec<Encoding> {
         &mut self.overflowing
     }
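A short usage sketch for the new setter, mirroring how the `template_processing_overflowing` test builds its fixtures (standalone program written against the public 0.13.0 API):

use tokenizers::{Encoding, Token};

fn main() {
    // Build a small encoding, then attach an overflowing part to it.
    let mut encoding = Encoding::from_tokens(vec![Token::new(12, "Hello".into(), (0, 5))], 0);
    let overflowing = Encoding::from_tokens(vec![Token::new(13, "you".into(), (12, 15))], 0);
    encoding.set_overflowing(vec![overflowing]);
    assert_eq!(encoding.get_overflowing().len(), 1);
}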
tokenizers/src/tokenizer/mod.rs
@@ -100,14 +100,21 @@ pub trait PostProcessor {
         pair_encoding: Option<Encoding>,
         add_special_tokens: bool,
     ) -> Result<Encoding> {
-        let encodings = if let Some(pair_encoding) = pair_encoding {
+        let mut encodings = if let Some(pair_encoding) = pair_encoding {
             vec![encoding, pair_encoding]
         } else {
             vec![encoding]
         };
+        encodings.iter_mut().enumerate().for_each(|(i, encoding)| {
+            encoding.set_sequence_id(i);
+            encoding
+                .get_overflowing_mut()
+                .iter_mut()
+                .for_each(|encoding| encoding.set_sequence_id(i));
+            encoding.set_type_ids(vec![i as u32; encoding.len()]);
+        });
 
         let encodings = self.process_encodings(encodings, add_special_tokens)?;
 
         Ok(Encoding::merge(encodings, false))
     }
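To make the effect of the changed `default_process` concrete, here is a hedged, self-contained sketch; it assumes the crate-root re-exports (`Encoding`, `PostProcessor`, `Result`, `Token`) and borrows its fixtures and assertions from the `bert_processing` test earlier in this diff:

use tokenizers::processors::bert::BertProcessing;
use tokenizers::{Encoding, PostProcessor, Result, Token};

fn main() -> Result<()> {
    // Two small encodings, built exactly as in the tests above.
    let encoding = Encoding::from_tokens(
        vec![
            Token::new(12, "Hello".into(), (0, 5)),
            Token::new(14, "there".into(), (6, 11)),
        ],
        0,
    );
    let pair = Encoding::from_tokens(vec![Token::new(15, "pair".into(), (0, 4))], 0);

    let processor = BertProcessing::default();
    let pair_encoding = processor.process(encoding, Some(pair), true)?;

    // default_process assigned sequence ids (including to any overflowing
    // encodings) before processing, so tokens resolve to their source
    // sequence, while special tokens resolve to None.
    assert_eq!(pair_encoding.token_to_sequence(1), Some(0)); // "Hello"
    assert_eq!(pair_encoding.token_to_sequence(4), Some(1)); // "pair"
    assert_eq!(pair_encoding.token_to_sequence(0), None); // "[CLS]"
    Ok(())
}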