diff --git a/bindings/node/lib/bindings/tokenizer.d.ts b/bindings/node/lib/bindings/tokenizer.d.ts
index f0db31ca..cd4a5185 100644
--- a/bindings/node/lib/bindings/tokenizer.d.ts
+++ b/bindings/node/lib/bindings/tokenizer.d.ts
@@ -392,6 +392,14 @@ export interface AddedTokenOptions {
* @default False
*/
singleWord?: boolean;
+ /**
+ * Whether this token should match against the normalized version of the input text. For
+ * example, with the added token `yesterday` and a normalizer in charge of lowercasing the
+ * text, the input `I saw a lion Yesterday` would match the token.
+ * Defaults to false for special tokens and true otherwise.
+ * @default True
+ */
+ normalized?: boolean;
}
/**
@@ -404,9 +412,10 @@ export class AddedToken {
/**
* Instantiate a new AddedToken
* @param content The content of the token
+ * @param special Whether this is a special token
* @param [options] Options for the token
*/
- constructor(content: string, options?: AddedTokenOptions);
+ constructor(content: string, special: boolean, options?: AddedTokenOptions);
/**
* Get the content of the AddedToken
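
For reference, the new constructor forwards to the core crate's builder; a minimal Rust sketch (assuming the `tokenizers::tokenizer` re-export path), where `special` only selects the default for `normalized` and the options object can still override it:

    use tokenizers::tokenizer::AddedToken;

    fn main() {
        // Regular added token: `normalized` defaults to true.
        let regular = AddedToken::from("yesterday", false);
        // Special token: `normalized` defaults to false unless overridden.
        let special = AddedToken::from("[MASK]", true).normalized(true);
        assert!(regular.normalized && special.normalized);
    }
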
diff --git a/bindings/node/lib/bindings/tokenizer.test.ts b/bindings/node/lib/bindings/tokenizer.test.ts
index 6c66e337..7f45c870 100644
--- a/bindings/node/lib/bindings/tokenizer.test.ts
+++ b/bindings/node/lib/bindings/tokenizer.test.ts
@@ -32,17 +32,17 @@ import {
describe("AddedToken", () => {
it("instantiates with only content", () => {
- const addToken = new AddedToken("test");
+ const addToken = new AddedToken("test", false);
expect(addToken.constructor.name).toEqual("AddedToken");
});
it("instantiates with empty options", () => {
- const addToken = new AddedToken("test", {});
+ const addToken = new AddedToken("test", false, {});
expect(addToken.constructor.name).toEqual("AddedToken");
});
it("instantiates with options", () => {
- const addToken = new AddedToken("test", {
+ const addToken = new AddedToken("test", false, {
leftStrip: true,
rightStrip: true,
singleWord: true
@@ -52,7 +52,7 @@ describe("AddedToken", () => {
describe("getContent", () => {
it("returns the string content of AddedToken", () => {
- const addedToken = new AddedToken("test");
+ const addedToken = new AddedToken("test", false);
expect(addedToken.getContent()).toEqual("test");
});
});
@@ -107,7 +107,7 @@ describe("Tokenizer", () => {
it("accepts a list of AddedToken as new tokens when initial model is empty", () => {
const model = BPE.empty();
const tokenizer = new Tokenizer(model);
- const addedToken = new AddedToken("test");
+ const addedToken = new AddedToken("test", false);
const nbAdd = tokenizer.addTokens([addedToken]);
expect(nbAdd).toBe(1);
@@ -132,7 +132,7 @@ describe("Tokenizer", () => {
const model = BPE.empty();
tokenizer = new Tokenizer(model);
- tokenizer.addTokens(["my", "name", "is", "john", new AddedToken("pair")]);
+ tokenizer.addTokens(["my", "name", "is", "john", new AddedToken("pair", false)]);
encode = promisify(tokenizer.encode.bind(tokenizer));
encodeBatch = promisify(tokenizer.encodeBatch.bind(tokenizer));
diff --git a/bindings/node/native/src/tokenizer.rs b/bindings/node/native/src/tokenizer.rs
index fdd270d1..0dbdfb37 100644
--- a/bindings/node/native/src/tokenizer.rs
+++ b/bindings/node/native/src/tokenizer.rs
@@ -30,10 +30,11 @@ struct AddedTokenOptions {
singleWord: Option<bool>,
leftStrip: Option<bool>,
rightStrip: Option<bool>,
+ normalized: Option<bool>,
}
impl AddedTokenOptions {
- fn into_added_token(self, content: String) -> tk::AddedToken {
- let mut token = tk::AddedToken::from(content);
+ fn into_added_token(self, content: String, special: bool) -> tk::AddedToken {
+ let mut token = tk::AddedToken::from(content, special);
if let Some(sw) = self.singleWord {
token = token.single_word(sw);
}
@@ -43,6 +44,9 @@ impl AddedTokenOptions {
if let Some(rs) = self.rightStrip {
token = token.rstrip(rs);
}
+ if let Some(n) = self.normalized {
+ token = token.normalized(n);
+ }
token
}
}
@@ -52,18 +56,20 @@ declare_types! {
init(mut cx) {
// init(
// content: string,
+ // special: boolean,
// options?: {
// singleWord?: boolean = false,
// leftStrip?: boolean = false,
// rightStrip?: boolean = false
+ // normalized?: boolean = true,
// }
// )
- let content = cx.extract::<String>(0)
- .map_err(|_| Error("First argument must be string".into()))?;
- let token = cx.extract_opt::<AddedTokenOptions>(1)?
+ let content = cx.extract::<String>(0)?;
+ let special = cx.extract::<bool>(1)?;
+ let token = cx.extract_opt::<AddedTokenOptions>(2)?
.unwrap_or_else(AddedTokenOptions::default)
- .into_added_token(content);
+ .into_added_token(content, special);
Ok(AddedToken { token })
}
@@ -87,7 +93,7 @@ impl FromJsValue for AddedToken {
fn from_value<'c, C: Context<'c>>(from: Handle<'c, JsValue>, cx: &mut C) -> LibResult<Self> {
if let Ok(token) = from.downcast::<JsString>() {
Ok(AddedToken {
- token: tk::AddedToken::from(token.value()),
+ token: tk::AddedToken::from(token.value(), false),
})
} else if let Ok(token) = from.downcast::<JsAddedToken>() {
let guard = cx.lock();
@@ -99,6 +105,21 @@ impl FromJsValue for AddedToken {
}
}
+struct SpecialToken(tk::AddedToken);
+impl FromJsValue for SpecialToken {
+ fn from_value<'c, C: Context<'c>>(from: Handle<'c, JsValue>, cx: &mut C) -> LibResult<Self> {
+ if let Ok(token) = from.downcast::<JsString>() {
+ Ok(SpecialToken(tk::AddedToken::from(token.value(), true)))
+ } else if let Ok(token) = from.downcast::<JsAddedToken>() {
+ let guard = cx.lock();
+ let token = token.borrow(&guard);
+ Ok(SpecialToken(token.token.clone()))
+ } else {
+ Err(Error("Expected `string | AddedToken`".into()))
+ }
+ }
+}
+
// encode & encodeBatch types
struct TextInputSequence(tk::InputSequence);
@@ -623,7 +644,7 @@ declare_types! {
let this = cx.this();
let guard = cx.lock();
- let token = this.borrow(&guard).tokenizer.id_to_token(id);
+ let token = this.borrow(&guard).tokenizer.id_to_token(id).map(|t| t.to_owned());
if let Some(token) = token {
Ok(cx.string(token).upcast())
@@ -650,9 +671,9 @@ declare_types! {
method addSpecialTokens(mut cx) {
// addSpecialTokens(tokens: (string | AddedToken)[]): number
- let tokens = cx.extract_vec::<AddedToken>(0)?
+ let tokens = cx.extract_vec::<SpecialToken>(0)?
.into_iter()
- .map(|token| token.into())
+ .map(|token| token.0)
.collect::<Vec<_>>();
let mut this = cx.this();
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index e53a8472..f62bd3c8 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -29,7 +29,7 @@ impl AddedToken {
#[new]
#[args(kwargs = "**")]
fn new(content: &str, is_special_token: bool, kwargs: Option<&PyDict>) -> PyResult<Self> {
- let mut token = tk::tokenizer::AddedToken::from(content.to_owned(), is_special_token);
+ let mut token = tk::tokenizer::AddedToken::from(content, is_special_token);
if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
diff --git a/bindings/python/tokenizers/__init__.pyi b/bindings/python/tokenizers/__init__.pyi
index 30fe0b1a..60641207 100644
--- a/bindings/python/tokenizers/__init__.pyi
+++ b/bindings/python/tokenizers/__init__.pyi
@@ -200,7 +200,13 @@ class AddedToken:
"""
def __new__(
- cls, content: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False,
+ cls,
+ content: str,
+ is_special_token: bool,
+ single_word: bool = False,
+ lstrip: bool = False,
+ rstrip: bool = False,
+ normalized: bool = True,
) -> AddedToken:
""" Instantiate a new AddedToken
@@ -208,19 +214,30 @@ class AddedToken:
content: str:
The content of the token
+ is_special_token: bool:
+ Whether this token is a special token. This impacts the default value for
+ `normalized`, which is False for special tokens but True for others.
+
single_word: bool
- Whether this token should only match against single word. If True,
- this token will never match inside of a word.
+ Whether this token should only match against single words. If True,
+ this token will never match inside of a word. For example the token `ing` would
+ match on `tokenizing` if this option is False, but not if it is True.
lstrip: bool
Whether this token should strip all potential whitespaces on the left side.
- If True, this token will greedily match any whitespace on the left and then strip
- them out.
+ If True, this token will greedily match any whitespace on the left. For example,
+ if we try to match the token `[MASK]` with lstrip=True, in the text `I saw a [MASK]`
+ we will match on ` [MASK]`.
rstrip: bool
Whether this token should strip all potential whitespaces on the right side.
- If True, this token will greedily match any whitespace on the right and then strip
- them out.
+ If True, this token will greedily match any whitespace on the right. It works just
+ like lstrip, but on the right.
+
+ normalized: bool:
+ Whether this token should match against the normalized version of the input text. For
+ example, with the added token `yesterday` and a normalizer in charge of lowercasing
+ the text, the token could be extracted from the input `I saw a lion Yesterday`.
"""
pass
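
The Rust builder exposes the same knobs; a hedged sketch using the builder methods from `added_vocabulary.rs` below (crate path assumed):

    use tokenizers::tokenizer::AddedToken;

    fn main() {
        // `ing` only matches as a standalone word, never inside `tokenizing`.
        let ing = AddedToken::from("ing", false).single_word(true);
        // `[MASK]` greedily captures surrounding whitespace on both sides.
        let mask = AddedToken::from("[MASK]", true).lstrip(true).rstrip(true);
        // A special token that still matches against the lowercased (normalized) text.
        let sep = AddedToken::from("[SEP]", true).normalized(true);
        println!("{} {} {}", ing.content, mask.content, sep.content);
    }
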
diff --git a/tokenizers/README.md b/tokenizers/README.md
index 169c0d74..20370ffd 100644
--- a/tokenizers/README.md
+++ b/tokenizers/README.md
@@ -9,7 +9,7 @@
-
+
@@ -56,22 +56,22 @@ fn main() -> Result<()>{
.vocab_size(vocab_size)
.min_frequency(0)
.special_tokens(vec![
- AddedToken::from("".into()),
- AddedToken::from("".into()),
- AddedToken::from("".into()),
- AddedToken::from("".into()),
- AddedToken::from("".into()),
+ AddedToken::from("", true),
+ AddedToken::from("", true),
+ AddedToken::from("", true),
+ AddedToken::from("", true),
+ AddedToken::from("", true),
])
.build(),
);
-
+
let mut tokenizer = Tokenizer::new(Box::new(BPE::default()));
tokenizer.with_normalizer(Box::new(Sequence::new(vec![
Box::new(Strip::new(true, true)),
Box::new(NFC),
])));
tokenizer.with_pre_tokenizer(Box::new(ByteLevel::default()));
-
+
tokenizer.train(&trainer, vec!["/path/to/train.txt".to_string()])?;
tokenizer.save("/path/to/trained_tokenizer", true)?;
@@ -86,7 +86,7 @@ use tokenizers::Result;
use tokenizers::tokenizer::Tokenizer;
fn main() -> Result<()>{
-
+
let tokenizer = Tokenizer::from_file("/path/to/trained_tokenizer")?;
let sample_encoding = tokenizer.encode("Huggingface", false)?;
diff --git a/tokenizers/benches/bpe_benchmark.rs b/tokenizers/benches/bpe_benchmark.rs
index d66018fd..31f0c5f7 100644
--- a/tokenizers/benches/bpe_benchmark.rs
+++ b/tokenizers/benches/bpe_benchmark.rs
@@ -17,9 +17,8 @@ fn create_gpt2_tokenizer(bpe: BPE) -> Tokenizer {
let mut tokenizer = Tokenizer::new(Box::new(bpe));
tokenizer.with_pre_tokenizer(Box::new(ByteLevel::default()));
tokenizer.with_decoder(Box::new(ByteLevel::default()));
- tokenizer.add_tokens(&[AddedToken::from(String::from("ing"), false).single_word(false)]);
- tokenizer
- .add_special_tokens(&[AddedToken::from(String::from("[ENT]"), true).single_word(true)]);
+ tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]);
tokenizer
}
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 8b02da0a..80926f32 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -20,13 +20,14 @@ pub struct AddedToken {
/// Whether this token should be normalized
pub normalized: bool,
}
+
impl AddedToken {
/// Build this token from the given content, specifying if it is intended to be a
/// special token. Special tokens are not normalized by default.
- pub fn from(content: String, special_token: bool) -> Self {
+ pub fn from<S: Into<String>>(content: S, special: bool) -> Self {
AddedToken {
- content,
- normalized: !special_token,
+ content: content.into(),
+ normalized: !special,
..Default::default()
}
}
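
With the `Into<String>` bound, call sites no longer need an explicit `.into()`; a quick sketch (crate path assumed):

    use tokenizers::tokenizer::AddedToken;

    fn main() {
        // Both a &str and an owned String satisfy Into<String>.
        let from_slice = AddedToken::from("<mask>", true);
        let from_owned = AddedToken::from(String::from("<mask>"), true);
        assert_eq!(from_slice.content, from_owned.content);
    }
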
@@ -48,7 +49,7 @@ impl AddedToken {
self.rstrip = rstrip;
self
}
- /// Specify whether this token should be normalized, and/or match against its normalized
+ /// Specify whether this token should be normalized and match against its normalized
/// version in the input text.
pub fn normalized(mut self, normalized: bool) -> Self {
self.normalized = normalized;
@@ -108,7 +109,7 @@ impl Default for AddedToken {
single_word: false,
lstrip: false,
rstrip: false,
- normalized: false,
+ normalized: true,
}
}
}
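
Since the plain `Default` is now `normalized: true`, only the `special` flag passed to `from` flips it off; a small sketch of the resulting defaults (crate path assumed):

    use tokenizers::tokenizer::AddedToken;

    fn main() {
        // Default, and any non-special added token: matched on the normalized text.
        assert!(AddedToken::default().normalized);
        assert!(AddedToken::from("yesterday", false).normalized);
        // Special tokens opt out of normalization by default.
        assert!(!AddedToken::from("[CLS]", true).normalized);
    }
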
@@ -144,22 +145,22 @@ type MatchingSet = (regex::RegexSet, Vec<u32>);
/// exist as required.
///
pub(super) struct AddedVocabulary {
- /// The size of the original vocabulary. This is what we use to determine the new
- /// ids we need to generate
- original_vocab_size: usize,
- /// Contains the mapping from String to ID as the user intended it. This map
- /// contains both special tokens and classic added tokens.
+ /// Contains the mapping from String (token content) to ID. This map contains both special
+ /// tokens and classic added tokens that were added to this vocabulary.
added_tokens_map: HashMap<String, u32>,
/// Contains the mapping from ID to AddedToken for all the added tokens, both special
/// and classic.
added_tokens_map_r: HashMap<u32, AddedToken>,
+
/// Contains only the classic AddedToken, in the specific order the user gave them.
added_tokens: Vec<AddedToken>,
/// Contains only the special AddedToken, in the specific order the user gave them.
special_tokens: Vec<AddedToken>,
+
/// A Set containing all the special tokens for easy access while decoding. This lets
- /// use remove them easily with an O(1) complexity.
+ /// us remove them easily with an O(1) complexity.
special_tokens_set: HashSet<String>,
+
/// A RegexSet containing all the non-normalized patterns used to split on AddedTokens
split_re: MatchingSet,
/// A RegexSet containing all the normalized patterns used to split on AddedTokens
@@ -167,9 +168,8 @@ pub(super) struct AddedVocabulary {
}
impl AddedVocabulary {
- pub fn new(original_vocab_size: usize) -> Self {
+ pub fn new() -> Self {
Self {
- original_vocab_size,
added_tokens_map: HashMap::new(),
added_tokens_map_r: HashMap::new(),
added_tokens: vec![],
@@ -180,12 +180,6 @@ impl AddedVocabulary {
}
}
- /// Sets the original vocabulary size. We need this value to return IDs that
- /// are shifted according to the original vocabulary.
- pub fn update_original_vocab_size(&mut self, size: usize) {
- self.original_vocab_size = size;
- }
-
/// Size of the additional vocabulary
pub fn len(&self) -> usize {
self.added_tokens_map.len()
@@ -252,7 +246,7 @@ impl AddedVocabulary {
ignored += 1;
id
} else {
- let new_id = (self.original_vocab_size + self.added_tokens_map.len()) as u32;
+ let new_id = (model.get_vocab_size() + self.added_tokens_map.len()) as u32;
self.added_tokens_map.insert(token.content.clone(), new_id);
if !self.special_tokens_set.contains(&token.content) {
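
The id handed to a freshly added token now comes straight from the live model instead of a cached `original_vocab_size`; the arithmetic below is illustrative only, not part of the crate's API:

    // Mirrors `model.get_vocab_size() + self.added_tokens_map.len()`.
    fn next_added_id(model_vocab_size: usize, already_added: usize) -> u32 {
        (model_vocab_size + already_added) as u32
    }

    fn main() {
        // With a model vocabulary of 2 entries, added tokens receive ids 2, 3, ...
        assert_eq!(next_added_id(2, 0), 2);
        assert_eq!(next_added_id(2, 1), 3);
    }
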
@@ -400,7 +394,6 @@ impl AddedVocabulary {
splits
.into_iter()
.map(|(idx, (start, end))| {
- // TODO: Check this works (especially for offsets)
let normalized = sentence
.slice_bytes(Range::Normalized(start..end))
.expect("Error while extracting normalized Range");
@@ -472,7 +465,6 @@ impl Serialize for AddedVocabulary {
.added_tokens_map_r
.iter()
.map(|(id, token)| AddedTokenWithId {
- // TODO: Make sure these are the right IDs (related to the model)
id: *id,
special: self.special_tokens_set.contains(&token.content),
token: token.clone(),
@@ -488,3 +480,211 @@ impl Serialize for AddedVocabulary {
vocabulary.end()
}
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::normalizers::utils::Lowercase;
+ use crate::{Offsets, Result, Token};
+ use std::path::{Path, PathBuf};
+
+ #[derive(Serialize, Deserialize)]
+ struct ModelMock {
+ vocab: HashMap<String, u32>,
+ vocab_r: HashMap<u32, String>,
+ }
+ impl ModelMock {
+ pub fn new<I>(iter: I) -> Self
+ where
+ I: IntoIterator<Item = &'static (&'static str, u32)>,
+ {
+ let vocab: HashMap<String, u32> = iter
+ .into_iter()
+ .map(|&(tok, id)| (tok.to_string(), id))
+ .collect();
+ Self {
+ vocab_r: vocab
+ .iter()
+ .map(|(tok, id)| (*id, tok.to_owned()))
+ .collect(),
+ vocab,
+ }
+ }
+ }
+ #[typetag::serde]
+ impl Model for ModelMock {
+ fn tokenize(&self, _tokens: Vec<(String, Offsets)>) -> Result<Vec<Token>> {
+ unimplemented!()
+ }
+ fn token_to_id(&self, token: &str) -> Option<u32> {
+ self.vocab.get(token).copied()
+ }
+ fn id_to_token(&self, id: u32) -> Option<&str> {
+ self.vocab_r.get(&id).map(String::as_ref)
+ }
+ fn get_vocab(&self) -> &HashMap<String, u32> {
+ &self.vocab
+ }
+ fn get_vocab_size(&self) -> usize {
+ self.vocab.len()
+ }
+ fn save(&self, _folder: &Path, _name: Option<&str>) -> Result<Vec<PathBuf>> {
+ unimplemented!()
+ }
+ }
+
+ #[test]
+ fn can_add_tokens() {
+ let model = ModelMock::new(&[("test", 0), ("tost", 1)]);
+ let mut vocab = AddedVocabulary::new();
+
+ // Add tokens normally
+ assert_eq!(
+ vocab.add_tokens(&[AddedToken::from("added_token_1", false)], &model, None),
+ 1
+ );
+ assert_eq!(vocab.len(), 1);
+
+ // Does not add the same token multiple times
+ assert_eq!(
+ vocab.add_tokens(
+ &[
+ AddedToken::from("added_token_2", false),
+ AddedToken::from("added_token_2", false)
+ ],
+ &model,
+ None
+ ),
+ 1
+ );
+ assert_eq!(vocab.len(), 2);
+
+ // Does not add tokens already covered by the model
+ assert_eq!(
+ vocab.add_tokens(&[AddedToken::from("test", false)], &model, None),
+ 0
+ );
+ assert_eq!(vocab.len(), 2);
+ }
+
+ #[test]
+ fn can_add_special_tokens() {
+ let model = ModelMock::new(&[("test", 0), ("tost", 1)]);
+ let mut vocab = AddedVocabulary::new();
+
+ // Add tokens normally
+ assert_eq!(
+ vocab.add_special_tokens(&[AddedToken::from("added_token_1", true)], &model, None),
+ 1
+ );
+ assert_eq!(vocab.len(), 1);
+
+ // Does not add the same token multiple times
+ assert_eq!(
+ vocab.add_special_tokens(
+ &[
+ AddedToken::from("added_token_2", true),
+ AddedToken::from("added_token_2", true)
+ ],
+ &model,
+ None
+ ),
+ 1
+ );
+ assert_eq!(vocab.len(), 2);
+
+ // Can add tokens already covered by the model
+ assert_eq!(
+ vocab.add_special_tokens(&[AddedToken::from("test", true)], &model, None),
+ 0
+ );
+ assert_eq!(vocab.len(), 2); // Did not add a new token, since it exists in the original model
+ assert_eq!(vocab.is_special_token("test"), true);
+ assert_eq!(vocab.added_tokens_map.contains_key("test"), false);
+ }
+
+ #[test]
+ fn can_extract_added_tokens() {
+ // Is able to extract both normal and special tokens
+ let model = ModelMock::new(&[]);
+ let mut vocab = AddedVocabulary::new();
+
+ vocab.add_tokens(
+ &[
+ AddedToken::from("my", false),
+ AddedToken::from("name", false),
+ ],
+ &model,
+ None,
+ );
+ vocab.add_special_tokens(
+ &[
+ AddedToken::from("[CLS]", true),
+ AddedToken::from("[SEP]", true),
+ ],
+ &model,
+ None,
+ );
+
+ let result = vocab.extract_and_normalize(None, "[CLS] My name is Anthony [SEP]");
+ assert_eq!(
+ result
+ .iter()
+ .map(|(normalized, id)| (normalized.get(), *id))
+ .collect::<Vec<_>>(),
+ vec![
+ ("[CLS]", Some(2)),
+ (" My ", None),
+ ("name", Some(1)),
+ (" is Anthony ", None),
+ ("[SEP]", Some(3))
+ ]
+ );
+ }
+
+ #[test]
+ fn options_use_cases() {
+ // Is able to extract both normal and special tokens, with various options (lstrip, rstrip,
+ // single_word, normalized)
+ let model = ModelMock::new(&[]);
+ let normalizer = Lowercase;
+ let mut vocab = AddedVocabulary::new();
+
+ vocab.add_tokens(
+ &[
+ AddedToken::from("my", false).lstrip(true).rstrip(true),
+ AddedToken::from("name", false),
+ AddedToken::from("ony", false).single_word(true),
+ ],
+ &model,
+ Some(&normalizer),
+ );
+ vocab.add_special_tokens(
+ &[
+ AddedToken::from("[CLS]", true),
+ AddedToken::from("[SEP]", true),
+ ],
+ &model,
+ Some(&normalizer),
+ );
+
+ let result =
+ vocab.extract_and_normalize(Some(&normalizer), "[CLS] My name is Anthony [SEP]");
+ assert_eq!(
+ result
+ .iter()
+ .map(|(normalized, id)| (normalized.get(), *id))
+ .collect::<Vec<_>>(),
+ vec![
+ ("[CLS]", Some(3)),
+ // This one includes both spaces because of the lstrip & rstrip
+ // And it matches because normalized == true
+ (" my ", Some(0)),
+ ("name", Some(1)),
+ // `ony` is not extracted here thanks to single_word
+ (" is anthony ", None),
+ ("[SEP]", Some(4))
+ ]
+ );
+ }
+}
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index fc90c121..be98a821 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -211,7 +211,6 @@ impl std::str::FromStr for Tokenizer {
impl Tokenizer {
/// Instantiate a new Tokenizer, with the given Model
pub fn new(model: Box) -> Self {
- let original_vocab_size = model.get_vocab_size();
Tokenizer {
normalizer: None,
pre_tokenizer: None,
@@ -219,7 +218,7 @@ impl Tokenizer {
post_processor: None,
decoder: None,
- added_vocabulary: AddedVocabulary::new(original_vocab_size),
+ added_vocabulary: AddedVocabulary::new(),
truncation: None,
padding: None,
@@ -303,8 +302,6 @@ impl Tokenizer {
/// Set the model
pub fn with_model(&mut self, model: Box) -> &Self {
self.model = model;
- self.added_vocabulary
- .update_original_vocab_size(self.model.get_vocab_size());
self
}
@@ -669,8 +666,6 @@ impl Tokenizer {
let (model, special_tokens) = trainer.train(words)?;
self.model = model;
- self.added_vocabulary
- .update_original_vocab_size(self.model.get_vocab_size());
self.add_special_tokens(&special_tokens);
Ok(())
diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
index 01969cc9..08866970 100644
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -50,7 +50,7 @@ where
/// It is possible to retrieve a part of the original string, by indexing it with offsets from the
/// normalized one, and the other way around too. It is also possible to convert offsets from one
/// referential to the other one easily.
-#[derive(Default, Debug, Clone)]
+#[derive(Default, Debug, Clone, PartialEq)]
pub struct NormalizedString {
/// The original version of the string, before any modification
original: String,
@@ -61,12 +61,6 @@ pub struct NormalizedString {
alignments: Vec<(usize, usize)>,
}
-impl std::cmp::PartialEq for NormalizedString {
- fn eq(&self, other: &NormalizedString) -> bool {
- self.normalized == other.normalized
- }
-}
-
impl NormalizedString {
/// Create a NormalizedString from the given str
pub fn from(s: &str) -> Self {
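
With `PartialEq` now derived, equality also compares the original text and the alignments, not just the normalized string; a hedged sketch (assuming `NormalizedString` is re-exported from `tokenizers::tokenizer` and exposes the `lowercase`/`get` methods used elsewhere in the crate):

    use tokenizers::tokenizer::NormalizedString;

    fn main() {
        let mut shouted = NormalizedString::from("HELLO");
        shouted.lowercase();
        let plain = NormalizedString::from("hello");

        // Both normalize to "hello"...
        assert_eq!(shouted.get(), plain.get());
        // ...but their originals (and alignments) differ, so with the derived
        // PartialEq the two values are no longer considered equal.
        assert_ne!(shouted, plain);
    }
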
@@ -441,7 +435,7 @@ impl NormalizedString {
/// Merge with the given NormalizedString by appending it to self
pub fn merge_with(&mut self, other: &NormalizedString) {
self.original.push_str(&other.original);
- let len = self.len();
+ let len = self.len() - 1;
self.alignments.extend(
other
.alignments
@@ -879,7 +873,7 @@ mod tests {
Some(NormalizedString {
original: "𝕞𝕠𝕣𝕟𝕚𝕟𝕘".to_string(),
normalized: "morning".to_string(),
- alignments: vec![(5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12)]
+ alignments: vec![(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)]
})
);
assert_eq!(
diff --git a/tokenizers/tests/added_tokens.rs b/tokenizers/tests/added_tokens.rs
index e1f69c1e..e2153f0a 100644
--- a/tokenizers/tests/added_tokens.rs
+++ b/tokenizers/tests/added_tokens.rs
@@ -9,8 +9,8 @@ fn add_tokens() {
assert_eq!(
tokenizer.add_special_tokens(&[
- AddedToken::from("".into(), true),
- AddedToken::from("".into(), true)
+ AddedToken::from("", true),
+ AddedToken::from("", true)
]),
2
);
@@ -19,8 +19,8 @@ fn add_tokens() {
assert_eq!(
tokenizer.add_tokens(&[
- AddedToken::from("hello".into(), false),
- AddedToken::from("world".into(), false)
+ AddedToken::from("hello", false),
+ AddedToken::from("world", false)
]),
2
);
@@ -31,7 +31,7 @@ fn add_tokens() {
#[test]
fn lstrip_tokens() {
let mut tokenizer = get_byte_level(true, false);
- tokenizer.add_special_tokens(&[AddedToken::from("".into(), true).lstrip(true)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("", true).lstrip(true)]);
let input = "I saw a 😺";
let output = tokenizer.encode(input, false).unwrap();
@@ -49,7 +49,7 @@ fn lstrip_tokens() {
#[test]
fn rstrip_tokens() {
let mut tokenizer = get_byte_level(false, false);
- tokenizer.add_special_tokens(&[AddedToken::from("".into(), true).rstrip(true)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("", true).rstrip(true)]);
let input = "I saw a 😺";
let output = tokenizer.encode(input, false).unwrap();
@@ -62,7 +62,7 @@ fn rstrip_tokens() {
// When `add_prefix_space = true` rstrip cannot work as a prefix space is added
// to the next token
let mut tokenizer = get_byte_level(true, false);
- tokenizer.add_special_tokens(&[AddedToken::from("".into(), true).rstrip(true)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("", true).rstrip(true)]);
let input = "I saw a 😺";
let output = tokenizer.encode(input, false).unwrap();
@@ -77,7 +77,7 @@ fn rstrip_tokens() {
fn single_word_tokens() {
// If `single_word = true` it shouldn't split `dancing`
let mut tokenizer = get_byte_level(false, false);
- tokenizer.add_special_tokens(&[AddedToken::from("ing".into(), true).single_word(true)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("ing", true).single_word(true)]);
let input = "I like dancing";
let output = tokenizer.encode(input, false).unwrap();
@@ -86,7 +86,7 @@ fn single_word_tokens() {
// If `single_word = false` it should split `dancing`
let mut tokenizer = get_byte_level(false, false);
- tokenizer.add_special_tokens(&[AddedToken::from("ing".into(), true).single_word(false)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("ing", true).single_word(false)]);
let input = "I like dancing";
let output = tokenizer.encode(input, false).unwrap();
@@ -98,9 +98,9 @@ fn single_word_tokens() {
fn overlapping_tokens() {
let mut tokenizer = get_byte_level(false, false);
- tokenizer.add_special_tokens(&[AddedToken::from("danc".into(), true)]);
- tokenizer.add_special_tokens(&[AddedToken::from("nci".into(), true)]);
- tokenizer.add_special_tokens(&[AddedToken::from("ing".into(), true)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("danc", true)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("nci", true)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("ing", true)]);
let input = "I like dancing";
let output = tokenizer.encode(input, false).unwrap();
@@ -109,10 +109,10 @@ fn overlapping_tokens() {
let mut tokenizer = get_byte_level(false, false);
- tokenizer.add_special_tokens(&[AddedToken::from("nci".into(), true)]);
- tokenizer.add_special_tokens(&[AddedToken::from("danc".into(), true)]);
- tokenizer.add_special_tokens(&[AddedToken::from("ing".into(), true)]);
- tokenizer.add_special_tokens(&[AddedToken::from("ike".into(), true)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("nci", true)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("danc", true)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("ing", true)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("ike", true)]);
let output = tokenizer.encode(input, false).unwrap();
diff --git a/tokenizers/tests/offsets.rs b/tokenizers/tests/offsets.rs
index 29074bc1..88ce4118 100644
--- a/tokenizers/tests/offsets.rs
+++ b/tokenizers/tests/offsets.rs
@@ -158,7 +158,7 @@ fn split_on_added_tokens_bert() {
let input = "Yesterday I saw a [MASK] far away";
let mut tokenizer = get_bert();
- tokenizer.add_special_tokens(&[AddedToken::from("[MASK]".into(), true)]);
+ tokenizer.add_special_tokens(&[AddedToken::from("[MASK]", true)]);
let output = tokenizer.encode(input, false).unwrap();
assert_eq!(