diff --git a/bindings/node/lib/bindings/tokenizer.d.ts b/bindings/node/lib/bindings/tokenizer.d.ts index f0db31ca..cd4a5185 100644 --- a/bindings/node/lib/bindings/tokenizer.d.ts +++ b/bindings/node/lib/bindings/tokenizer.d.ts @@ -392,6 +392,14 @@ export interface AddedTokenOptions { * @default False */ singleWord?: boolean; + /** + * Whether this token should match on the normalized version of the text. For example + * with the added token `yesterday` and a normalizer in charge of lowercasing the text, + * the input `I saw a lion Yesterday` would match the token. + * This is False for special tokens by default, true otherwise + * @default True + */ + normalized?: boolean; } /** @@ -404,9 +412,10 @@ export class AddedToken { /** * Instantiate a new AddedToken * @param content The content of the token + * @param special Whether this is a special token * @param [options] Options for the token */ - constructor(content: string, options?: AddedTokenOptions); + constructor(content: string, special: boolean, options?: AddedTokenOptions); /** * Get the content of the AddedToken diff --git a/bindings/node/lib/bindings/tokenizer.test.ts b/bindings/node/lib/bindings/tokenizer.test.ts index 6c66e337..7f45c870 100644 --- a/bindings/node/lib/bindings/tokenizer.test.ts +++ b/bindings/node/lib/bindings/tokenizer.test.ts @@ -32,17 +32,17 @@ import { describe("AddedToken", () => { it("instantiates with only content", () => { - const addToken = new AddedToken("test"); + const addToken = new AddedToken("test", false); expect(addToken.constructor.name).toEqual("AddedToken"); }); it("instantiates with empty options", () => { - const addToken = new AddedToken("test", {}); + const addToken = new AddedToken("test", false, {}); expect(addToken.constructor.name).toEqual("AddedToken"); }); it("instantiates with options", () => { - const addToken = new AddedToken("test", { + const addToken = new AddedToken("test", false, { leftStrip: true, rightStrip: true, singleWord: true @@ -52,7 +52,7 @@ describe("AddedToken", () => { describe("getContent", () => { it("returns the string content of AddedToken", () => { - const addedToken = new AddedToken("test"); + const addedToken = new AddedToken("test", false); expect(addedToken.getContent()).toEqual("test"); }); }); @@ -107,7 +107,7 @@ describe("Tokenizer", () => { it("accepts a list of AddedToken as new tokens when initial model is empty", () => { const model = BPE.empty(); const tokenizer = new Tokenizer(model); - const addedToken = new AddedToken("test"); + const addedToken = new AddedToken("test", false); const nbAdd = tokenizer.addTokens([addedToken]); expect(nbAdd).toBe(1); @@ -132,7 +132,7 @@ describe("Tokenizer", () => { const model = BPE.empty(); tokenizer = new Tokenizer(model); - tokenizer.addTokens(["my", "name", "is", "john", new AddedToken("pair")]); + tokenizer.addTokens(["my", "name", "is", "john", new AddedToken("pair", false)]); encode = promisify(tokenizer.encode.bind(tokenizer)); encodeBatch = promisify(tokenizer.encodeBatch.bind(tokenizer)); diff --git a/bindings/node/native/src/tokenizer.rs b/bindings/node/native/src/tokenizer.rs index fdd270d1..0dbdfb37 100644 --- a/bindings/node/native/src/tokenizer.rs +++ b/bindings/node/native/src/tokenizer.rs @@ -30,10 +30,11 @@ struct AddedTokenOptions { singleWord: Option, leftStrip: Option, rightStrip: Option, + normalized: Option, } impl AddedTokenOptions { - fn into_added_token(self, content: String) -> tk::AddedToken { - let mut token = tk::AddedToken::from(content); + fn into_added_token(self, content: String, 
special: bool) -> tk::AddedToken { + let mut token = tk::AddedToken::from(content, special); if let Some(sw) = self.singleWord { token = token.single_word(sw); } @@ -43,6 +44,9 @@ impl AddedTokenOptions { if let Some(rs) = self.rightStrip { token = token.rstrip(rs); } + if let Some(n) = self.normalized { + token = token.normalized(n); + } token } } @@ -52,18 +56,20 @@ declare_types! { init(mut cx) { // init( // content: string, + // special: boolean, // options?: { // singleWord?: boolean = false, // leftStrip?: boolean = false, // rightStrip?: boolean = false + // normalized?: boolean = true, // } // ) - let content = cx.extract::(0) - .map_err(|_| Error("First argument must be string".into()))?; - let token = cx.extract_opt::(1)? + let content = cx.extract::(0)?; + let special = cx.extract::(1)?; + let token = cx.extract_opt::(2)? .unwrap_or_else(AddedTokenOptions::default) - .into_added_token(content); + .into_added_token(content, special); Ok(AddedToken { token }) } @@ -87,7 +93,7 @@ impl FromJsValue for AddedToken { fn from_value<'c, C: Context<'c>>(from: Handle<'c, JsValue>, cx: &mut C) -> LibResult { if let Ok(token) = from.downcast::() { Ok(AddedToken { - token: tk::AddedToken::from(token.value()), + token: tk::AddedToken::from(token.value(), false), }) } else if let Ok(token) = from.downcast::() { let guard = cx.lock(); @@ -99,6 +105,21 @@ impl FromJsValue for AddedToken { } } +struct SpecialToken(tk::AddedToken); +impl FromJsValue for SpecialToken { + fn from_value<'c, C: Context<'c>>(from: Handle<'c, JsValue>, cx: &mut C) -> LibResult { + if let Ok(token) = from.downcast::() { + Ok(SpecialToken(tk::AddedToken::from(token.value(), true))) + } else if let Ok(token) = from.downcast::() { + let guard = cx.lock(); + let token = token.borrow(&guard); + Ok(SpecialToken(token.token.clone())) + } else { + Err(Error("Expected `string | AddedToken`".into())) + } + } +} + // encode & encodeBatch types struct TextInputSequence(tk::InputSequence); @@ -623,7 +644,7 @@ declare_types! { let this = cx.this(); let guard = cx.lock(); - let token = this.borrow(&guard).tokenizer.id_to_token(id); + let token = this.borrow(&guard).tokenizer.id_to_token(id).map(|t| t.to_owned()); if let Some(token) = token { Ok(cx.string(token).upcast()) @@ -650,9 +671,9 @@ declare_types! { method addSpecialTokens(mut cx) { // addSpecialTokens(tokens: (string | AddedToken)[]): number - let tokens = cx.extract_vec::(0)? + let tokens = cx.extract_vec::(0)? 
.into_iter() - .map(|token| token.into()) + .map(|token| token.0) .collect::>(); let mut this = cx.this(); diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index e53a8472..f62bd3c8 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -29,7 +29,7 @@ impl AddedToken { #[new] #[args(kwargs = "**")] fn new(content: &str, is_special_token: bool, kwargs: Option<&PyDict>) -> PyResult { - let mut token = tk::tokenizer::AddedToken::from(content.to_owned(), is_special_token); + let mut token = tk::tokenizer::AddedToken::from(content, is_special_token); if let Some(kwargs) = kwargs { for (key, value) in kwargs { diff --git a/bindings/python/tokenizers/__init__.pyi b/bindings/python/tokenizers/__init__.pyi index 30fe0b1a..60641207 100644 --- a/bindings/python/tokenizers/__init__.pyi +++ b/bindings/python/tokenizers/__init__.pyi @@ -200,7 +200,13 @@ class AddedToken: """ def __new__( - cls, content: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False, + cls, + content: str, + is_special_token: bool, + single_word: bool = False, + lstrip: bool = False, + rstrip: bool = False, + normalized: bool = True, ) -> AddedToken: """ Instantiate a new AddedToken @@ -208,19 +214,30 @@ content: str: The content of the token + is_special_token: bool: + Whether this token is a special token. This has an impact on the default value for + `normalized`, which is False for special tokens but True for others. + single_word: bool - Whether this token should only match against single word. If True, - this token will never match inside of a word. + Whether this token should only match against single words. If True, + this token will never match inside of a word. For example, the token `ing` would + match on `tokenizing` if this option is False, but not if this option is True. lstrip: bool Whether this token should strip all potential whitespaces on the left side. - If True, this token will greedily match any whitespace on the left and then strip - them out. + If True, this token will greedily match any whitespace on the left. For example, + if we try to match the token `[MASK]` with lstrip=True, in the text `I saw a [MASK]` + we will match on ` [MASK]`. rstrip: bool Whether this token should strip all potential whitespaces on the right side. - If True, this token will greedily match any whitespace on the right and then strip - them out. + If True, this token will greedily match any whitespace on the right. It works just + like lstrip, but on the right. + + normalized: bool: + Whether this token should match against the normalized version of the input text. For + example, with the added token `yesterday` and a normalizer in charge of lowercasing + the text, the token could be extracted from the input `I saw a lion Yesterday`. """ pass diff --git a/tokenizers/README.md b/tokenizers/README.md index 169c0d74..20370ffd 100644 --- a/tokenizers/README.md +++ b/tokenizers/README.md @@ -9,7 +9,7 @@ GitHub - Doc + Doc


@@ -56,22 +56,22 @@ fn main() -> Result<()>{ .vocab_size(vocab_size) .min_frequency(0) .special_tokens(vec![ - AddedToken::from("".into()), - AddedToken::from("".into()), - AddedToken::from("".into()), - AddedToken::from("".into()), - AddedToken::from("".into()), + AddedToken::from("", true), + AddedToken::from("", true), + AddedToken::from("", true), + AddedToken::from("", true), + AddedToken::from("", true), ]) .build(), ); - + let mut tokenizer = Tokenizer::new(Box::new(BPE::default())); tokenizer.with_normalizer(Box::new(Sequence::new(vec![ Box::new(Strip::new(true, true)), Box::new(NFC), ]))); tokenizer.with_pre_tokenizer(Box::new(ByteLevel::default())); - + tokenizer.train(&trainer, vec!["/path/to/train.txt".to_string()])?; tokenizer.save("/path/to/trained_tokenizer", true)?; @@ -86,7 +86,7 @@ use tokenizers::Result; use tokenizers::tokenizer::Tokenizer; fn main() -> Result<()>{ - + let tokenizer = Tokenizer::from_file("/path/to/trained_tokenizer")?; let sample_encoding = tokenizer.encode("Huggingface", false)?; diff --git a/tokenizers/benches/bpe_benchmark.rs b/tokenizers/benches/bpe_benchmark.rs index d66018fd..31f0c5f7 100644 --- a/tokenizers/benches/bpe_benchmark.rs +++ b/tokenizers/benches/bpe_benchmark.rs @@ -17,9 +17,8 @@ fn create_gpt2_tokenizer(bpe: BPE) -> Tokenizer { let mut tokenizer = Tokenizer::new(Box::new(bpe)); tokenizer.with_pre_tokenizer(Box::new(ByteLevel::default())); tokenizer.with_decoder(Box::new(ByteLevel::default())); - tokenizer.add_tokens(&[AddedToken::from(String::from("ing"), false).single_word(false)]); - tokenizer - .add_special_tokens(&[AddedToken::from(String::from("[ENT]"), true).single_word(true)]); + tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]); + tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]); tokenizer } diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index 8b02da0a..80926f32 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -20,13 +20,14 @@ pub struct AddedToken { /// Whether this token should be normalized pub normalized: bool, } + impl AddedToken { /// Build this token from the given content, specifying if it is intented to be a /// special token. Special tokens are not normalized by default. - pub fn from(content: String, special_token: bool) -> Self { + pub fn from>(content: S, special: bool) -> Self { AddedToken { - content, - normalized: !special_token, + content: content.into(), + normalized: !special, ..Default::default() } } @@ -48,7 +49,7 @@ impl AddedToken { self.rstrip = rstrip; self } - /// Specify whether this token should be normalized, and/or match against its normalized + /// Specify whether this token should be normalized and match against its normalized /// version in the input text. pub fn normalized(mut self, normalized: bool) -> Self { self.normalized = normalized; @@ -108,7 +109,7 @@ impl Default for AddedToken { single_word: false, lstrip: false, rstrip: false, - normalized: false, + normalized: true, } } } @@ -144,22 +145,22 @@ type MatchingSet = (regex::RegexSet, Vec); /// exist as required. /// pub(super) struct AddedVocabulary { - /// The size of the original vocabulary. This is what we use to determine the new - /// ids we need to generate - original_vocab_size: usize, - /// Contains the mapping from String to ID as the user intended it. This map - /// contains both special tokens and classic added tokens. 
+ /// Contains the mapping from String (token content) to ID. This map contains both special + /// tokens and classic added tokens that were added to the this vocabulary. added_tokens_map: HashMap, /// Contains the mapping from ID to AddedToken for all the added tokens, both special /// and classic. added_tokens_map_r: HashMap, + /// Contains only the classic AddedToken, in the specific order the user gave them. added_tokens: Vec, /// Contains only the special AddedToken, in the specific order the user gave them. special_tokens: Vec, + /// A Set, containing all the special token for easy access while decoding. This let's - /// use remove them easily with an O(1) complexity. + /// us remove them easily with an O(1) complexity. special_tokens_set: HashSet, + /// A RegexSet containing all the non-normalized patterns used to split on AddedTokens split_re: MatchingSet, /// A RegexSet containing all the normalized patterns used to split on AddedTokens @@ -167,9 +168,8 @@ pub(super) struct AddedVocabulary { } impl AddedVocabulary { - pub fn new(original_vocab_size: usize) -> Self { + pub fn new() -> Self { Self { - original_vocab_size, added_tokens_map: HashMap::new(), added_tokens_map_r: HashMap::new(), added_tokens: vec![], @@ -180,12 +180,6 @@ impl AddedVocabulary { } } - /// Sets the original vocabulary size. We need this value to return IDs that - /// are shifted according to the original vocabulary. - pub fn update_original_vocab_size(&mut self, size: usize) { - self.original_vocab_size = size; - } - /// Size of the additional vocabulary pub fn len(&self) -> usize { self.added_tokens_map.len() @@ -252,7 +246,7 @@ impl AddedVocabulary { ignored += 1; id } else { - let new_id = (self.original_vocab_size + self.added_tokens_map.len()) as u32; + let new_id = (model.get_vocab_size() + self.added_tokens_map.len()) as u32; self.added_tokens_map.insert(token.content.clone(), new_id); if !self.special_tokens_set.contains(&token.content) { @@ -400,7 +394,6 @@ impl AddedVocabulary { splits .into_iter() .map(|(idx, (start, end))| { - // TODO: Check this works (especially for offsets) let normalized = sentence .slice_bytes(Range::Normalized(start..end)) .expect("Error while extracting normalized Range"); @@ -472,7 +465,6 @@ impl Serialize for AddedVocabulary { .added_tokens_map_r .iter() .map(|(id, token)| AddedTokenWithId { - // TODO: Make sure these are the right IDs (related to the model) id: *id, special: self.special_tokens_set.contains(&token.content), token: token.clone(), @@ -488,3 +480,211 @@ impl Serialize for AddedVocabulary { vocabulary.end() } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::normalizers::utils::Lowercase; + use crate::{Offsets, Result, Token}; + use std::path::{Path, PathBuf}; + + #[derive(Serialize, Deserialize)] + struct ModelMock { + vocab: HashMap, + vocab_r: HashMap, + } + impl ModelMock { + pub fn new(iter: I) -> Self + where + I: IntoIterator, + { + let vocab: HashMap = iter + .into_iter() + .map(|&(tok, id)| (tok.to_string(), id)) + .collect(); + Self { + vocab_r: vocab + .iter() + .map(|(tok, id)| (*id, tok.to_owned())) + .collect(), + vocab, + } + } + } + #[typetag::serde] + impl Model for ModelMock { + fn tokenize(&self, _tokens: Vec<(String, Offsets)>) -> Result> { + unimplemented!() + } + fn token_to_id(&self, token: &str) -> Option { + self.vocab.get(token).copied() + } + fn id_to_token(&self, id: u32) -> Option<&str> { + self.vocab_r.get(&id).map(String::as_ref) + } + fn get_vocab(&self) -> &HashMap { + &self.vocab + } + fn get_vocab_size(&self) -> 
usize { + self.vocab.len() + } + fn save(&self, _folder: &Path, _name: Option<&str>) -> Result> { + unimplemented!() + } + } + + #[test] + fn can_add_tokens() { + let model = ModelMock::new(&[("test", 0), ("tost", 1)]); + let mut vocab = AddedVocabulary::new(); + + // Add tokens normally + assert_eq!( + vocab.add_tokens(&[AddedToken::from("added_token_1", false)], &model, None), + 1 + ); + assert_eq!(vocab.len(), 1); + + // Does not add multiple time the same token + assert_eq!( + vocab.add_tokens( + &[ + AddedToken::from("added_token_2", false), + AddedToken::from("added_token_2", false) + ], + &model, + None + ), + 1 + ); + assert_eq!(vocab.len(), 2); + + // Does not add tokens already covered by the model + assert_eq!( + vocab.add_tokens(&[AddedToken::from("test", false)], &model, None), + 0 + ); + assert_eq!(vocab.len(), 2); + } + + #[test] + fn can_add_special_tokens() { + let model = ModelMock::new(&[("test", 0), ("tost", 1)]); + let mut vocab = AddedVocabulary::new(); + + // Add tokens normally + assert_eq!( + vocab.add_special_tokens(&[AddedToken::from("added_token_1", true)], &model, None), + 1 + ); + assert_eq!(vocab.len(), 1); + + // Does not add multiple time the same token + assert_eq!( + vocab.add_special_tokens( + &[ + AddedToken::from("added_token_2", true), + AddedToken::from("added_token_2", true) + ], + &model, + None + ), + 1 + ); + assert_eq!(vocab.len(), 2); + + // Can add tokens already covered by the model + assert_eq!( + vocab.add_special_tokens(&[AddedToken::from("test", true)], &model, None), + 0 + ); + assert_eq!(vocab.len(), 2); // Did not add a new token, since it exist in the original model + assert_eq!(vocab.is_special_token("test"), true); + assert_eq!(vocab.added_tokens_map.contains_key("test"), false); + } + + #[test] + fn can_extract_added_tokens() { + // Is able to extract both normal and special tokens + let model = ModelMock::new(&[]); + let mut vocab = AddedVocabulary::new(); + + vocab.add_tokens( + &[ + AddedToken::from("my", false), + AddedToken::from("name", false), + ], + &model, + None, + ); + vocab.add_special_tokens( + &[ + AddedToken::from("[CLS]", true), + AddedToken::from("[SEP]", true), + ], + &model, + None, + ); + + let result = vocab.extract_and_normalize(None, "[CLS] My name is Anthony [SEP]"); + assert_eq!( + result + .iter() + .map(|(normalized, id)| (normalized.get(), *id)) + .collect::>(), + vec![ + ("[CLS]", Some(2)), + (" My ", None), + ("name", Some(1)), + (" is Anthony ", None), + ("[SEP]", Some(3)) + ] + ); + } + + #[test] + fn options_use_cases() { + // Is able to extract both normal and special tokens, with various options (lstrip, rstrip, + // single_word, normalized) + let model = ModelMock::new(&[]); + let normalizer = Lowercase; + let mut vocab = AddedVocabulary::new(); + + vocab.add_tokens( + &[ + AddedToken::from("my", false).lstrip(true).rstrip(true), + AddedToken::from("name", false), + AddedToken::from("ony", false).single_word(true), + ], + &model, + Some(&normalizer), + ); + vocab.add_special_tokens( + &[ + AddedToken::from("[CLS]", true), + AddedToken::from("[SEP]", true), + ], + &model, + Some(&normalizer), + ); + + let result = + vocab.extract_and_normalize(Some(&normalizer), "[CLS] My name is Anthony [SEP]"); + assert_eq!( + result + .iter() + .map(|(normalized, id)| (normalized.get(), *id)) + .collect::>(), + vec![ + ("[CLS]", Some(3)), + // This one includes both spaces because of the lstrip & rstrip + // And it matches because normalized == true + (" my ", Some(0)), + ("name", Some(1)), + // `ony` is not 
extracted here thanks to single_word + (" is anthony ", None), + ("[SEP]", Some(4)) + ] + ); + } +} diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index fc90c121..be98a821 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -211,7 +211,6 @@ impl std::str::FromStr for Tokenizer { impl Tokenizer { /// Instantiate a new Tokenizer, with the given Model pub fn new(model: Box) -> Self { - let original_vocab_size = model.get_vocab_size(); Tokenizer { normalizer: None, pre_tokenizer: None, @@ -219,7 +218,7 @@ impl Tokenizer { post_processor: None, decoder: None, - added_vocabulary: AddedVocabulary::new(original_vocab_size), + added_vocabulary: AddedVocabulary::new(), truncation: None, padding: None, @@ -303,8 +302,6 @@ impl Tokenizer { /// Set the model pub fn with_model(&mut self, model: Box) -> &Self { self.model = model; - self.added_vocabulary - .update_original_vocab_size(self.model.get_vocab_size()); self } @@ -669,8 +666,6 @@ impl Tokenizer { let (model, special_tokens) = trainer.train(words)?; self.model = model; - self.added_vocabulary - .update_original_vocab_size(self.model.get_vocab_size()); self.add_special_tokens(&special_tokens); Ok(()) diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs index 01969cc9..08866970 100644 --- a/tokenizers/src/tokenizer/normalizer.rs +++ b/tokenizers/src/tokenizer/normalizer.rs @@ -50,7 +50,7 @@ where /// It is possible to retrieve a part of the original string, by indexing it with offsets from the /// normalized one, and the other way around too. It is also possible to convert offsets from one /// referential to the other one easily. -#[derive(Default, Debug, Clone)] +#[derive(Default, Debug, Clone, PartialEq)] pub struct NormalizedString { /// The original version of the string, before any modification original: String, @@ -61,12 +61,6 @@ pub struct NormalizedString { alignments: Vec<(usize, usize)>, } -impl std::cmp::PartialEq for NormalizedString { - fn eq(&self, other: &NormalizedString) -> bool { - self.normalized == other.normalized - } -} - impl NormalizedString { /// Create a NormalizedString from the given str pub fn from(s: &str) -> Self { @@ -441,7 +435,7 @@ impl NormalizedString { /// Merge with the given NormalizedString by appending it to self pub fn merge_with(&mut self, other: &NormalizedString) { self.original.push_str(&other.original); - let len = self.len(); + let len = self.len() - 1; self.alignments.extend( other .alignments @@ -879,7 +873,7 @@ mod tests { Some(NormalizedString { original: "𝕞𝕠𝕣𝕟𝕚𝕟𝕘".to_string(), normalized: "morning".to_string(), - alignments: vec![(5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12)] + alignments: vec![(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)] }) ); assert_eq!( diff --git a/tokenizers/tests/added_tokens.rs b/tokenizers/tests/added_tokens.rs index e1f69c1e..e2153f0a 100644 --- a/tokenizers/tests/added_tokens.rs +++ b/tokenizers/tests/added_tokens.rs @@ -9,8 +9,8 @@ fn add_tokens() { assert_eq!( tokenizer.add_special_tokens(&[ - AddedToken::from("".into(), true), - AddedToken::from("".into(), true) + AddedToken::from("", true), + AddedToken::from("", true) ]), 2 ); @@ -19,8 +19,8 @@ fn add_tokens() { assert_eq!( tokenizer.add_tokens(&[ - AddedToken::from("hello".into(), false), - AddedToken::from("world".into(), false) + AddedToken::from("hello", false), + AddedToken::from("world", false) ]), 2 ); @@ -31,7 +31,7 @@ fn add_tokens() { #[test] fn lstrip_tokens() { let mut 
tokenizer = get_byte_level(true, false); - tokenizer.add_special_tokens(&[AddedToken::from("".into(), true).lstrip(true)]); + tokenizer.add_special_tokens(&[AddedToken::from("", true).lstrip(true)]); let input = "I saw a 😺"; let output = tokenizer.encode(input, false).unwrap(); @@ -49,7 +49,7 @@ fn lstrip_tokens() { #[test] fn rstrip_tokens() { let mut tokenizer = get_byte_level(false, false); - tokenizer.add_special_tokens(&[AddedToken::from("".into(), true).rstrip(true)]); + tokenizer.add_special_tokens(&[AddedToken::from("", true).rstrip(true)]); let input = "I saw a 😺"; let output = tokenizer.encode(input, false).unwrap(); @@ -62,7 +62,7 @@ fn rstrip_tokens() { // When `add_prefix_space = true` rstrip cannot work as a prefix space is added // to the next token let mut tokenizer = get_byte_level(true, false); - tokenizer.add_special_tokens(&[AddedToken::from("".into(), true).rstrip(true)]); + tokenizer.add_special_tokens(&[AddedToken::from("", true).rstrip(true)]); let input = "I saw a 😺"; let output = tokenizer.encode(input, false).unwrap(); @@ -77,7 +77,7 @@ fn rstrip_tokens() { fn single_word_tokens() { // If `single_word = true` it shouldn't split `dancing` let mut tokenizer = get_byte_level(false, false); - tokenizer.add_special_tokens(&[AddedToken::from("ing".into(), true).single_word(true)]); + tokenizer.add_special_tokens(&[AddedToken::from("ing", true).single_word(true)]); let input = "I like dancing"; let output = tokenizer.encode(input, false).unwrap(); @@ -86,7 +86,7 @@ fn single_word_tokens() { // If `single_word = false` it should split `dancing` let mut tokenizer = get_byte_level(false, false); - tokenizer.add_special_tokens(&[AddedToken::from("ing".into(), true).single_word(false)]); + tokenizer.add_special_tokens(&[AddedToken::from("ing", true).single_word(false)]); let input = "I like dancing"; let output = tokenizer.encode(input, false).unwrap(); @@ -98,9 +98,9 @@ fn single_word_tokens() { fn overlapping_tokens() { let mut tokenizer = get_byte_level(false, false); - tokenizer.add_special_tokens(&[AddedToken::from("danc".into(), true)]); - tokenizer.add_special_tokens(&[AddedToken::from("nci".into(), true)]); - tokenizer.add_special_tokens(&[AddedToken::from("ing".into(), true)]); + tokenizer.add_special_tokens(&[AddedToken::from("danc", true)]); + tokenizer.add_special_tokens(&[AddedToken::from("nci", true)]); + tokenizer.add_special_tokens(&[AddedToken::from("ing", true)]); let input = "I like dancing"; let output = tokenizer.encode(input, false).unwrap(); @@ -109,10 +109,10 @@ fn overlapping_tokens() { let mut tokenizer = get_byte_level(false, false); - tokenizer.add_special_tokens(&[AddedToken::from("nci".into(), true)]); - tokenizer.add_special_tokens(&[AddedToken::from("danc".into(), true)]); - tokenizer.add_special_tokens(&[AddedToken::from("ing".into(), true)]); - tokenizer.add_special_tokens(&[AddedToken::from("ike".into(), true)]); + tokenizer.add_special_tokens(&[AddedToken::from("nci", true)]); + tokenizer.add_special_tokens(&[AddedToken::from("danc", true)]); + tokenizer.add_special_tokens(&[AddedToken::from("ing", true)]); + tokenizer.add_special_tokens(&[AddedToken::from("ike", true)]); let output = tokenizer.encode(input, false).unwrap(); diff --git a/tokenizers/tests/offsets.rs b/tokenizers/tests/offsets.rs index 29074bc1..88ce4118 100644 --- a/tokenizers/tests/offsets.rs +++ b/tokenizers/tests/offsets.rs @@ -158,7 +158,7 @@ fn split_on_added_tokens_bert() { let input = "Yesterday I saw a [MASK] far away"; let mut tokenizer = get_bert(); - 
tokenizer.add_special_tokens(&[AddedToken::from("[MASK]".into(), true)]); + tokenizer.add_special_tokens(&[AddedToken::from("[MASK]", true)]); let output = tokenizer.encode(input, false).unwrap(); assert_eq!(
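
For readers skimming this patch, here is a minimal usage sketch (not part of the diff) of the updated Rust API: `AddedToken::from` now takes the token content plus a `special` flag, and `normalized` defaults to `true` for regular added tokens and `false` for special ones. The placeholder `BPE::default()` model and the import paths are assumptions taken from the README example and the bindings above, and `token_to_id` is assumed from the existing `Tokenizer` API rather than introduced by this change.

```rust
use tokenizers::models::bpe::BPE;
use tokenizers::tokenizer::{AddedToken, Tokenizer};
use tokenizers::Result;

fn main() -> Result<()> {
    // Placeholder model, mirroring the README example in this patch.
    let mut tokenizer = Tokenizer::new(Box::new(BPE::default()));

    // Regular added token: `normalized` now defaults to true, so it can
    // match against the normalized version of the input text.
    tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(true)]);

    // Special token: `normalized` defaults to false, and the builder methods
    // (single_word, lstrip, rstrip, normalized) can still override any option.
    tokenizer.add_special_tokens(&[AddedToken::from("[MASK]", true)
        .lstrip(true)
        .rstrip(true)]);

    // Added tokens receive ids starting right after the model's vocabulary,
    // now computed from `model.get_vocab_size()` at insertion time instead of
    // a cached `original_vocab_size`.
    println!("{:?}", tokenizer.token_to_id("[MASK]"));

    Ok(())
}
```

The Node and Python constructors in this patch follow the same shape, as the `.d.ts` and `.pyi` stubs above describe: `new AddedToken("test", false, { normalized: true })` and `AddedToken("test", False, normalized=True)`.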