Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
Add unigram bytefallback (#1217)
* current updates will go red
* cargo fmt
* npm install
* refactor train for unigram to allow bytefallbakc (breaking)
* fmt
* nits
* update
* add a proper test
* fix encode optimised fallback + add trainer arg
* fixes
* fixes
* fix tests
* add test
* fmt
* fix rust test
* update python bindings
* update
* pub is okay and needed
* more fix
* cleanup
* remove useles id
* MissingUnkId error
* nits
* fix offset
* add a test in python
* update src bindings
* remove bytefallback from trainer
* styling
* update pckg
* lint
* fmt
* stup with dev
* update code based on review
* remove unused function
* udpate python test to compare ids
* fix option bool issues
* final fix
* clippy
* fix npm isntall
* update
* update test
* more in depth testing
* Lint
* last attempt to fix node
* update node bindings
* fmt
* Update tokenizers/src/models/unigram/model.rs

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>

* update based on review
* simpler test
* lint

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
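The heart of the change is a third `byte_fallback` argument on `Unigram::from` plus a byte-level fallback path during tokenization: when a piece is not in the vocabulary and byte fallback is enabled, the piece is re-encoded as `<0xNN>` byte tokens instead of collapsing to `<unk>`. A minimal Rust sketch of the new call shape, mirroring the Rust test added further down (the tiny vocabulary here is hypothetical, and it assumes `Model` and `Token` are re-exported at the crate root):

```rust
use tokenizers::models::unigram::Unigram;
use tokenizers::{Model, Token};

fn main() {
    // Hypothetical vocabulary: <unk> plus the two UTF-8 byte pieces of "é" (0xC3 0xA9).
    let vocab = vec![
        ("<unk>".to_string(), 0.0),
        ("<0xC3>".to_string(), -0.01),
        ("<0xA9>".to_string(), -0.03),
    ];
    // New third argument enables byte fallback (the old signature took only vocab and unk_id).
    let model = Unigram::from(vocab, Some(0), true).unwrap();
    // "é" is not a piece, so it falls back to its byte tokens instead of <unk>.
    let tokens: Vec<Token> = model.tokenize("é").unwrap();
    let values: Vec<&str> = tokens.iter().map(|t| t.value.as_str()).collect();
    assert_eq!(values, ["<0xC3>", "<0xA9>"]);
}
```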
bindings/node/lib/bindings/models.d.ts (vendored, 5 changes)
@@ -170,6 +170,11 @@ export interface UnigramOptions {
* @default undefined
*/
unkId?: number;
/**
* Whether or not bytefallback support should be enabled.
* @default false
*/
byte_fallback?: boolean;
}

export namespace Unigram {
@@ -124,6 +124,7 @@ describe("Unigram", () => {
],
{
unkId: 0,
byte_fallback: false,
}
);
expect(unigram.constructor.name).toEqual("Model");
@@ -191,6 +191,7 @@ fn bpe_init(mut cx: FunctionContext) -> JsResult<JsModel> {
/// unkToken?: string,
/// continuingSubwordPrefix?: string,
/// endOfWordSuffix?: string
/// byteFallback?: bool
/// }, callback)
fn bpe_from_file(mut cx: FunctionContext) -> JsResult<JsUndefined> {
let (options, callback) = match cx.extract_opt::<BpeOptions>(2) {
@@ -369,16 +370,16 @@ fn wordlevel_empty(mut cx: FunctionContext) -> JsResult<JsModel> {
#[serde(rename_all = "camelCase")]
struct UnigramOptions {
unk_id: Option<usize>,
byte_fallback: Option<bool>,
}

/// unigram_init(vocab: [string, number][], options?: {
/// unkId?: number
/// })
fn unigram_init(mut cx: FunctionContext) -> JsResult<JsModel> {
let vocab = cx.extract::<Vec<(String, f64)>>(0)?;
let options = cx.extract_opt::<UnigramOptions>(1)?.unwrap_or_default();

let unigram = tk::models::unigram::Unigram::from(vocab, options.unk_id)
let byte_fallback = options.byte_fallback.unwrap_or(false);
let unigram = tk::models::unigram::Unigram::from(vocab, options.unk_id, byte_fallback)
.map_err(|e| Error(e.to_string()))?;

let mut js_model = JsModel::new::<_, JsModel, _>(&mut cx, vec![])?;
bindings/node/package-lock.json (generated, 13620 changes)
File diff suppressed because it is too large
@@ -16,7 +16,9 @@
"license": "Apache-2.0",
"dependencies": {
"@types/node": "^13.13.52",
"node-pre-gyp": "^0.14.0"
"native": "^0.3.3",
"node-pre-gyp": "^0.14.0",
"package.json": "^2.0.1"
},
"devDependencies": {
"@types/jest": "^26.0.24",
@@ -162,6 +162,7 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
vocab = [(piece.piece, piece.score) for piece in m.pieces]
unk_id = m.trainer_spec.unk_id
model_type = m.trainer_spec.model_type
byte_fallback = m.trainer_spec.byte_fallback
if model_type != 1:
raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
@@ -170,7 +171,7 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
replacement = "▁"
add_prefix_space = True

tokenizer = Tokenizer(Unigram(vocab, unk_id))
tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))

tokenizer.normalizer = normalizers.Sequence(
[
@@ -242,11 +242,11 @@ class Unigram(Model):
An implementation of the Unigram algorithm

Args:
vocab (:obj:`List[Tuple[str, float]]`, `optional`):
vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
A list of vocabulary items and their relative score [("am", -0.2442),...]
"""

def __init__(self, vocab):
def __init__(self, vocab, unk_id, byte_fallback):
pass
def get_trainer(self):
"""
@@ -804,24 +804,32 @@ impl PyWordLevel {
/// An implementation of the Unigram algorithm
///
/// Args:
/// vocab (:obj:`List[Tuple[str, float]]`, `optional`):
/// vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
/// A list of vocabulary items and their relative score [("am", -0.2442),...]
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "Unigram")]
#[pyo3(text_signature = "(self, vocab)")]
#[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
pub struct PyUnigram {}

#[pymethods]
impl PyUnigram {
#[new]
fn new(vocab: Option<Vec<(String, f64)>>, unk_id: Option<usize>) -> PyResult<(Self, PyModel)> {
match (vocab, unk_id) {
(Some(vocab), unk_id) => {
let model = Unigram::from(vocab, unk_id).map_err(|e| {
exceptions::PyException::new_err(format!("Error while loading Unigram: {}", e))
})?;
fn new(
vocab: Option<Vec<(String, f64)>>,
unk_id: Option<usize>,
byte_fallback: Option<bool>,
) -> PyResult<(Self, PyModel)> {
match (vocab, unk_id, byte_fallback) {
(Some(vocab), unk_id, byte_fallback) => {
let model =
Unigram::from(vocab, unk_id, byte_fallback.unwrap_or(false)).map_err(|e| {
exceptions::PyException::new_err(format!(
"Error while loading Unigram: {}",
e
))
})?;
Ok((PyUnigram {}, model.into()))
}
(None, None) => Ok((PyUnigram {}, Unigram::default().into())),
(None, None, _) => Ok((PyUnigram {}, Unigram::default().into())),
_ => Err(exceptions::PyValueError::new_err(
"`vocab` and `unk_id` must be both specified",
)),
@@ -5,7 +5,7 @@ import pytest

from tokenizers import AddedToken, Encoding, Tokenizer
from tokenizers.implementations import BertWordPieceTokenizer
from tokenizers.models import BPE, Model, WordPiece
from tokenizers.models import BPE, Model, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import BertProcessing, RobertaProcessing
@@ -412,3 +412,29 @@ class TestTokenizer:
tokenizer = Tokenizer.from_pretrained("anthony/tokenizers-test", revision="gpt-2")
output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
assert output.tokens == ["Hey", "Ġthere", "Ġdear", "Ġfriend", "!"]

def test_unigram_byte_fallback(self):
vocab = [
("<unk>", 0.0),
("A", -0.01),
("sen", -0.02),
("te", -0.03),
("n", -0.04),
("ce", -0.05),
("<0xF0>", -0.06),
("<0x9F>", -0.06),
("<0xA4>", -0.06),
("<0x97>", -0.06),
(" ", -0.4),
]
tokenizer = tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=False))

output = tokenizer.encode("A sentence 🤗")
assert output.ids == [1, 10, 2, 3, 4, 5, 10, 0]
assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "🤗"]

tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=True))

output = tokenizer.encode("A sentence 🤗")
assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
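For reference, the four `<0xNN>` tokens asserted above are just the UTF-8 encoding of the emoji: "🤗" is a single character but four bytes. A small Rust sketch of that decomposition, using the same `<0x{:02X}>` formatting the model applies:

```rust
fn main() {
    // "🤗" (U+1F917) is one char but four UTF-8 bytes, so byte fallback
    // replaces the single <unk> with four byte-level tokens.
    let byte_tokens: Vec<String> = "🤗".bytes().map(|b| format!("<0x{:02X}>", b)).collect();
    assert_eq!(byte_tokens, ["<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]);
}
```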
@@ -27,6 +27,7 @@ pub struct Unigram {

fuse_unk: bool,
is_optimized: bool,
byte_fallback: bool,
}
impl PartialEq for Unigram {
fn eq(&self, other: &Self) -> bool {
@@ -50,6 +51,7 @@ impl Clone for Unigram {
eos_id: self.eos_id,
fuse_unk: self.fuse_unk,
is_optimized: self.is_optimized,
byte_fallback: self.byte_fallback,
}
}
}
@@ -59,6 +61,7 @@ impl std::fmt::Debug for Unigram {
fmt.debug_struct("Unigram")
.field("vocab", &self.vocab.len())
.field("unk_id", &self.unk_id)
.field("byte_fallback", &self.byte_fallback)
.finish()
}
}
@@ -78,7 +81,7 @@ pub enum UnigramError {
impl Default for Unigram {
fn default() -> Self {
let vocab = vec![("<unk>".to_string(), 0.0)];
Self::from(vocab, Some(0)).unwrap()
Self::from(vocab, Some(0), false).unwrap()
}
}

@@ -89,7 +92,11 @@ impl Unigram {
/// unk_id, is the index within the vocabulary.
/// For now `Unigram` *requires* at least `unk` because we might find a never seen char.
/// Further versions might allow that part to be hidden.
pub fn from(vocab: Vec<(String, f64)>, unk_id: Option<usize>) -> Result<Self> {
pub fn from(
vocab: Vec<(String, f64)>,
unk_id: Option<usize>,
byte_fallback: bool,
) -> Result<Self> {
let n = vocab.len();
let mut token_to_ids: TokenMap = HashMap::new();
let mut builder = TrieBuilder::default();
@@ -102,7 +109,6 @@ impl Unigram {
return Err(Box::new(UnigramError::UnkIdNotInVocabulary));
}
}

let bos_id = n + 1;
let eos_id = n + 2;

@@ -130,6 +136,7 @@ impl Unigram {
fuse_unk,
cache: Cache::default(),
is_optimized,
byte_fallback,
})
}

@@ -143,7 +150,9 @@ impl Unigram {
pub(super) fn set_optimized(&mut self, is_optimized: bool) {
self.is_optimized = is_optimized;
}

pub fn byte_fallback(&self) -> bool {
self.byte_fallback
}
pub(super) fn len(&self) -> usize {
self.vocab.len()
}
@@ -205,7 +214,7 @@ impl Unigram {
/// ("abc".to_string(), 5.0),
/// ("abcd".to_string(), 10.0),
/// ];
/// let model = Unigram::from(pieces, Some(0)).unwrap();
/// let model = Unigram::from(pieces, Some(0), false).unwrap();
/// let result = model.encode("abcdacdxx").unwrap();
/// assert_eq!(result, vec!["abcd", "a", "cd", "xx"]);
/// ```
@@ -407,12 +416,31 @@ impl Model for Unigram {
let mut offset = 0;
let mut tokens = Vec::with_capacity(str_tokens.len());
for string in str_tokens {
let id: u32 = match self.token_to_ids.get(&string) {
Some(id) => *id,
None => self.unk_id.ok_or(UnigramError::MissingUnkId)? as u32,
};
let len = string.len();
let offsets = (offset, offset + len);
let id: u32 = match self.token_to_ids.get(&string) {
Some(id) => *id,
None => {
if self.byte_fallback {
let byte_tokens: Option<Vec<_>> = string
.bytes()
.map(|byte| -> Option<Token> {
let byte_string = format!("<0x{:02X}>", byte);
let id = self.token_to_ids.get(&byte_string);
id.map(|id| Token::new(*id, byte_string, (offset, offset + len)))
})
.collect();
if let Some(byte_tokens) = byte_tokens {
for token in byte_tokens {
tokens.push(token);
}
offset += len;
continue;
}
}
self.unk_id.ok_or(UnigramError::MissingUnkId)? as u32
}
};
offset += len;
tokens.push(Token::new(id, string, offsets));
}
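One detail worth noting in the fallback branch above: collecting into an `Option<Vec<_>>` makes the fallback all-or-nothing. If any single byte piece is missing from the vocabulary, the whole `collect()` yields `None` and the token is emitted as `<unk>` rather than as a partial byte sequence. A standalone sketch of that `Option`-collect behaviour, with a deliberately incomplete, hypothetical vocabulary:

```rust
fn main() {
    // Mirror of the pattern above: map each byte to Some(piece) if the piece
    // exists, None otherwise; collect() short-circuits to None on the first miss.
    let vocab = ["<0xC3>"]; // "<0xA9>" is deliberately missing
    let byte_tokens: Option<Vec<String>> = "é"
        .bytes()
        .map(|byte| {
            let byte_string = format!("<0x{:02X}>", byte);
            vocab.contains(&byte_string.as_str()).then_some(byte_string)
        })
        .collect();
    // No partial result: the caller falls back to the <unk> id instead.
    assert!(byte_tokens.is_none());
}
```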
@@ -452,7 +480,7 @@ mod tests {
#[test]
fn test_populate_nodes_unk() {
let pieces = vec![("<unk>".to_string(), 0.0)];
let model = Unigram::from(pieces, Some(0)).unwrap();
let model = Unigram::from(pieces, Some(0), false).unwrap();

let mut lattice = Lattice::from("abc", model.bos_id, model.eos_id);
model.populate_nodes(&mut lattice);
@@ -477,7 +505,7 @@ mod tests {
("ab".to_string(), 0.3),
("bc".to_string(), 0.4),
];
let model = Unigram::from(pieces, Some(0)).unwrap();
let model = Unigram::from(pieces, Some(0), false).unwrap();

let mut lattice = Lattice::from("abc", model.bos_id, model.eos_id);
model.populate_nodes(&mut lattice);
@@ -514,7 +542,7 @@ mod tests {
("abcd".to_string(), 10.0),
];

let model = Unigram::from(sentencepieces, Some(0)).unwrap();
let model = Unigram::from(sentencepieces, Some(0), false).unwrap();
let result = model.encode("abcd").unwrap();
assert_eq!(result, vec!["abcd"]);
}
@@ -536,7 +564,7 @@ mod tests {
("qr".to_string(), -0.5),
];

let mut model = Unigram::from(sentencepieces, Some(0)).unwrap();
let mut model = Unigram::from(sentencepieces, Some(0), false).unwrap();

for is_optimized in &[true, false] {
model.set_optimized(*is_optimized);
@@ -573,4 +601,35 @@ mod tests {
assert_eq!(model.encode("abqrcd").unwrap(), vec!["ab", "q", "r", "cd"]);
}
}

#[test]
fn test_unigram_bytefallback() {
// In [97]: processor.encode_as_pieces("⅐⅛⅑ ")
// Out[97]: ['▁', '<0xE2>', '<0x85>', '<0x90>', '⅛', '<0xE2>', '<0x85>', '<0x91>', '▁']
let sentencepieces = vec![
("<unk>".to_string(), 0.0),
("<0xC3>".to_string(), -0.01),
("<0xA9>".to_string(), -0.03),
];
let unigram = Unigram::from(sentencepieces, Some(0), true).unwrap();
let tokens: Vec<Token> = unigram.tokenize("é").unwrap();
assert_eq!(
tokens,
[
Token {
id: 1,
value: "<0xC3>".to_string(),
offsets: (0, 2)
},
Token {
id: 2,
value: "<0xA9>".to_string(),
offsets: (0, 2)
}
]
);

let tokens = unigram.tokenize("?é").unwrap();
assert_eq!(tokens[0].id, 0);
}
}
@@ -15,6 +15,7 @@ impl Serialize for Unigram {
model.serialize_field("type", "Unigram")?;
model.serialize_field("unk_id", &self.unk_id)?;
model.serialize_field("vocab", &self.vocab)?;
model.serialize_field("byte_fallback", &self.byte_fallback())?;

model.end()
}
@@ -25,7 +26,11 @@ impl<'de> Deserialize<'de> for Unigram {
where
D: Deserializer<'de>,
{
deserializer.deserialize_struct("Unigram", &["type", "vocab", "unk_id"], UnigramVisitor)
deserializer.deserialize_struct(
"Unigram",
&["type", "vocab", "unk_id", "byte_fallback"],
UnigramVisitor,
)
}
}
@@ -43,11 +48,13 @@ impl<'de> Visitor<'de> for UnigramVisitor {
{
let mut vocab: Option<Vec<(String, f64)>> = None;
let mut unk_id: Option<usize> = None;
let mut byte_fallback: bool = false;
while let Some(key) = map.next_key::<String>()? {
match key.as_ref() {
"unk_id" => {
unk_id = map.next_value()?;
}
"byte_fallback" => byte_fallback = map.next_value()?,
"vocab" => vocab = Some(map.next_value()?),
"type" => match map.next_value()? {
"Unigram" => {}
@@ -61,10 +68,10 @@ impl<'de> Visitor<'de> for UnigramVisitor {
_ => (),
}
}
match (vocab, unk_id) {
(Some(vocab), unk_id) => Ok(Unigram::from(vocab, unk_id)
match (vocab, unk_id, byte_fallback) {
(Some(vocab), unk_id, byte_fallback) => Ok(Unigram::from(vocab, unk_id, byte_fallback)
.map_err(|err| Error::custom(format!("Unable to load vocab {:?}", err)))?),
(None, _) => Err(Error::custom("Missing vocab")),
(None, _, _) => Err(Error::custom("Missing vocab")),
}
}
}
@@ -76,7 +83,7 @@ mod test {
#[test]
fn test_serialization() {
let vocab = vec![("<unk>".to_string(), 0.0), ("a".to_string(), -0.5)];
let model = Unigram::from(vocab, Some(0)).unwrap();
let model = Unigram::from(vocab, Some(0), false).unwrap();

let data = serde_json::to_string(&model).unwrap();
let reconstructed = serde_json::from_str(&data).unwrap();
@@ -87,7 +94,7 @@ mod test {
#[test]
fn test_serialization_unk_id_not_zero() {
let vocab = vec![("a".to_string(), -0.5), ("<unk>".to_string(), 0.0)];
let model = Unigram::from(vocab, Some(1)).unwrap();
let model = Unigram::from(vocab, Some(1), false).unwrap();

let data = serde_json::to_string(&model).unwrap();
let reconstructed = serde_json::from_str(&data).unwrap();
@@ -98,7 +105,7 @@ mod test {
#[test]
fn test_serialization_no_unk_id() {
let vocab = vec![("a".to_string(), -0.5)];
let model = Unigram::from(vocab, None).unwrap();
let model = Unigram::from(vocab, None, false).unwrap();

let data = serde_json::to_string(&model).unwrap();
let reconstructed = serde_json::from_str(&data).unwrap();
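Putting the serialization hunks together, a model saved with byte fallback now carries an explicit flag in its JSON, and older files without the field still load because the visitor defaults `byte_fallback` to `false`. A rough sketch of the expected shape, assuming `serde_json` is available as in the test module above (the JSON in the comment is illustrative, not normative):

```rust
use tokenizers::models::unigram::Unigram;

fn main() {
    let vocab = vec![("<unk>".to_string(), 0.0), ("a".to_string(), -0.5)];
    let model = Unigram::from(vocab, Some(0), true).unwrap();
    let data = serde_json::to_string(&model).unwrap();
    // Field order follows the serializer above, roughly:
    // {"type":"Unigram","unk_id":0,"vocab":[["<unk>",0.0],["a",-0.5]],"byte_fallback":true}
    assert!(data.contains("\"byte_fallback\":true"));
    // Round trip: the flag survives deserialization.
    let reconstructed: Unigram = serde_json::from_str(&data).unwrap();
    assert!(reconstructed.byte_fallback());
}
```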
@@ -177,7 +177,11 @@ impl UnigramTrainer {
special_tokens.insert(0, (self.unk_token.clone().unwrap(), 0.0));
}

Unigram::from(special_tokens.into_iter().chain(pieces).collect(), unk_id)
Unigram::from(
special_tokens.into_iter().chain(pieces).collect(),
unk_id,
model.byte_fallback(),
)
}

fn required_chars(&self, word_counts: &[Sentence]) -> HashSet<String> {
@@ -563,7 +567,7 @@ impl UnigramTrainer {
if required_chars.len() as u32 > self.vocab_size {
return Err(Box::new(UnigramTrainerError::VocabularyTooSmall));
}
let mut new_model = Unigram::from(pieces.clone(), Some(0))?;
let mut new_model = Unigram::from(pieces.clone(), Some(0), false)?;
loop {
// Sub-EM iteration.
for _iter in 0..self.n_sub_iterations {
@@ -572,7 +576,7 @@ impl UnigramTrainer {

// Executes M step.
pieces = self.run_m_step(&pieces, &expected);
new_model = Unigram::from(pieces.clone(), Some(0))?;
new_model = Unigram::from(pieces.clone(), Some(0), false)?;

// Useful comment for checking compatibility with spm
debug!(
@@ -596,7 +600,7 @@ impl UnigramTrainer {

// Prunes pieces.
pieces = self.prune_sentence_pieces(&new_model, &pieces, &sentences);
new_model = Unigram::from(pieces.clone(), Some(0))?;
new_model = Unigram::from(pieces.clone(), Some(0), false)?;
}
self.finalize_progress(&progress, expected_updates);