Addressing first pass of comments.

Nicolas Patry
2020-09-23 11:29:17 +02:00
parent 1cd4824273
commit 8f8156fd2c
10 changed files with 196 additions and 125 deletions

View File

@ -70,13 +70,17 @@ elif args.type == "bert":
tok_r = Tokenizer(WordPiece(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100))
tok_r.normalizer = BertNormalizer(
clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
clean_text=True,
handle_chinese_chars=True,
strip_accents=True,
lowercase=True,
)
# tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tok_r.decoder = decoders.WordPiece()
tok_r.post_processor = BertProcessing(
("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")),
("[SEP]", tok_r.token_to_id("[SEP]")),
("[CLS]", tok_r.token_to_id("[CLS]")),
)
else:
raise Exception(f"Unknown type {args.type}")

View File

@ -32,7 +32,10 @@ if not files:
# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
clean_text=True,
handle_chinese_chars=True,
strip_accents=True,
lowercase=True,
)
# And then train

View File

@ -76,43 +76,28 @@ class SpmConverter(Converter):
model_type = proto.trainer_spec.model_type
vocab = self.vocab(proto)
unk_id = self.unk_id(proto)
filename = self.original_tokenizer.vocab_file
if model_type == 1:
data = {"unk_id": unk_id, "vocab": vocab}
out_vocab_filename = f"{filename}.json"
try:
with open(out_vocab_filename, "w") as f:
json.dump(data, f, indent=4)
tokenizer = Tokenizer(Unigram(out_vocab_filename))
finally:
os.remove(out_vocab_filename)
tokenizer = Tokenizer(Unigram(vocab, unk_id))
elif model_type == 2:
vocab, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
vocab, merges = SentencePieceExtractor(
self.original_tokenizer.vocab_file
).extract()
# Open output files and let's extract model information
out_vocab_filename = f"{filename}.vocab"
out_merge_filename = f"{filename}.merge"
try:
with open(out_vocab_filename, "w") as vocab_f:
json.dump(vocab, vocab_f)
try:
with open(out_merge_filename, "w") as merges_f:
# Save content
merges_f.writelines(map(lambda x: f"{x[0]} {x[1]}{os.linesep}", merges))
tokenizer = Tokenizer(
BPE(
out_vocab_filename,
out_merge_filename,
unk_token=proto.trainer_spec.unk_piece,
fuse_unk=True,
)
)
finally:
os.remove(out_merge_filename)
finally:
os.remove(out_vocab_filename)
actual_merges = {}
for id_merge, (a, b) in enumerate(merges):
id_a = vocab[a]
id_b = vocab[b]
id_ab = vocab[a + b]
actual_merges[(id_a, id_b)] = (id_merge, id_ab)
tokenizer = Tokenizer(
BPE(
vocab,
actual_merges,
unk_token=proto.trainer_spec.unk_piece,
fuse_unk=True,
)
)
else:
raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
@ -346,7 +331,9 @@ class PegasusConverter(SpmConverter):
return TemplateProcessing(
seq_a=["$0", eos],
seq_b=["$1", eos],
special_tokens=[(eos, tokenizer.get_vocab()[eos]),],
special_tokens=[
(eos, tokenizer.get_vocab()[eos]),
],
)
@ -355,7 +342,9 @@ class T5Converter(SpmConverter):
return TemplateProcessing(
seq_a=["$0", "</s>"],
seq_b=["$1", "</s>"],
special_tokens=[("</s>", tokenizer.get_vocab()["</s>"]),],
special_tokens=[
("</s>", tokenizer.get_vocab()["</s>"]),
],
)
@ -447,7 +436,9 @@ def main():
model_len = 50
status_len = 6
speedup_len = 8
print(f"|{'Model':^{model_len}}|{'Status':^{status_len}}|{'Speedup':^{speedup_len}}|")
print(
f"|{'Model':^{model_len}}|{'Status':^{status_len}}|{'Speedup':^{speedup_len}}|"
)
print(f"|{'-'*model_len}|{'-'*status_len}|{'-'*speedup_len}|")
for pretrained in args.models:
status, speedup = check(pretrained, args.filename)

View File

@ -17,7 +17,11 @@ except Exception:
def main():
parser = ArgumentParser("SentencePiece parity checker")
parser.add_argument(
"--input-file", "-i", type=str, required=True, help="Which files do you want to train from",
"--input-file",
"-i",
type=str,
required=True,
help="Which files do you want to train from",
)
parser.add_argument(
"--model-file",
@ -28,13 +32,22 @@ def main():
help="Use a pretrained token file",
)
parser.add_argument(
"--model-prefix", type=str, default="spm_parity", help="Model prefix for spm_train",
"--model-prefix",
type=str,
default="spm_parity",
help="Model prefix for spm_train",
)
parser.add_argument(
"--vocab-size", "-v", type=int, default=8000, help="Vocab size for spm_train",
"--vocab-size",
"-v",
type=int,
default=8000,
help="Vocab size for spm_train",
)
parser.add_argument(
"--verbose", action="store_true", help="Verbosity",
"--verbose",
action="store_true",
help="Verbosity",
)
parser.add_argument(
"--train",
@ -160,10 +173,14 @@ def check_details(line, spm_ids, tok_ids, sp, tok):
spms = Counter(spm_ids[first:last])
toks = Counter(tok_ids[first:last])
removable_tokens = {spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si}
removable_tokens = {
spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si
}
min_width = 3
for i in range(last - first - min_width):
if all(spm_ids[first + i + j] in removable_tokens for j in range(min_width)):
if all(
spm_ids[first + i + j] in removable_tokens for j in range(min_width)
):
possible_matches = [
k
for k in range(last - first - min_width)
@ -174,7 +191,11 @@ def check_details(line, spm_ids, tok_ids, sp, tok):
if check_diff(
spm_ids[first : first + i], tok_ids[first : first + j], sp, tok
) and check_details(
line, spm_ids[first + i : last], tok_ids[first + j : last], sp, tok,
line,
spm_ids[first + i : last],
tok_ids[first + j : last],
sp,
tok,
):
return True
@ -189,7 +210,9 @@ def check_details(line, spm_ids, tok_ids, sp, tok):
wrong = tok.decode(spm_ids[first:last])
print()
if has_color:
print(f"{colored(ok_start, 'grey')}{colored(wrong, 'red')}{colored(ok_end, 'grey')}")
print(
f"{colored(ok_start, 'grey')}{colored(wrong, 'red')}{colored(ok_end, 'grey')}"
)
else:
print(wrong)
return False
@ -203,17 +226,8 @@ def check_encode(args):
tok = tokenizers.SentencePieceUnigramTokenizer.from_spm(args.model_file)
else:
vocab = [(sp.id_to_piece(i), sp.get_score(i)) for i in range(sp.piece_size())]
vocab_filename = f"{args.model_file}.json"
unk_id = sp.unk_id()
data = {"unk_id": unk_id, "vocab": vocab}
try:
with open(vocab_filename, "w") as f:
json.dump(data, f, indent=4)
tok = tokenizers.SentencePieceUnigramTokenizer(vocab_filename)
finally:
os.remove(vocab_filename)
tok = tokenizers.SentencePieceUnigramTokenizer(vocab, unk_id)
perfect = 0
imperfect = 0
@ -255,7 +269,9 @@ def check_encode(args):
print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
total = perfect + imperfect + wrong
print(f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}")
print(
f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}"
)
if __name__ == "__main__":

View File

@ -34,3 +34,11 @@ impl<T> ToPyResult<T> {
self.into()
}
}
pub(crate) fn deprecation_warning(version: &str, message: &str) -> PyResult<()> {
let gil = pyo3::Python::acquire_gil();
let python = gil.python();
let deprecation_warning = python.import("builtins")?.get("DeprecationWarning")?;
let full_message = format!("Deprecated in {}: {}", version, message);
pyo3::PyErr::warn(python, deprecation_warning, &full_message, 0)
}
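
On the Python side this helper surfaces as a regular `DeprecationWarning`, so callers can filter or assert on it with the standard `warnings` machinery. A rough sketch (the file paths are placeholders and must point at real GPT-2-style vocab/merges files for the deprecated constructor to load successfully):

import warnings
from tokenizers.models import BPE

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Deprecated file-based constructor: emits "Deprecated in 0.9.0: ...".
    BPE("path/to/vocab.json", "path/to/merges.txt")

assert any(issubclass(w.category, DeprecationWarning) for w in caught)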

View File

@ -15,16 +15,7 @@ use tk::models::ModelWrapper;
use tk::{Model, Token};
use tokenizers as tk;
use super::error::ToPyResult;
fn deprecation_warning(version: &str, message: &str) -> PyResult<()> {
let gil = pyo3::Python::acquire_gil();
let python = gil.python();
let deprecation_warning = python.import("builtins")?.get("DeprecationWarning")?;
let full_message = format!("Deprecated in {}: {}", version, message);
pyo3::PyErr::warn(python, deprecation_warning, &full_message, 0)?;
Ok(())
}
use super::error::{deprecation_warning, ToPyResult};
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
@ -183,13 +174,24 @@ impl PyBPE {
}
}
#[derive(FromPyObject)]
enum PyVocab<'a> {
Vocab(Vocab),
Filename(&'a str),
}
#[derive(FromPyObject)]
enum PyMerges<'a> {
Merges(Merges),
Filename(&'a str),
}
#[pymethods]
impl PyBPE {
#[new]
#[args(kwargs = "**")]
fn new(
vocab: Option<&PyAny>,
merges: Option<&PyAny>,
vocab: Option<PyVocab>,
merges: Option<PyMerges>,
kwargs: Option<&PyDict>,
) -> PyResult<(Self, PyModel)> {
if (vocab.is_some() && merges.is_none()) || (vocab.is_none() && merges.is_some()) {
@ -199,17 +201,24 @@ impl PyBPE {
}
let mut builder = BPE::builder();
if let (Some(vocab_any), Some(merges_any)) = (vocab, merges) {
if let (Ok(vocab), Ok(merges)) = (vocab_any.extract(), merges_any.extract()) {
builder = builder.vocab_and_merges(vocab, merges);
} else {
let vocab_filename: String = vocab_any.extract()?;
let merges_filename: String = merges_any.extract()?;
deprecation_warning(
if let (Some(vocab), Some(merges)) = (vocab, merges) {
match (vocab, merges) {
(PyVocab::Vocab(vocab), PyMerges::Merges(merges)) => {
builder = builder.vocab_and_merges(vocab, merges);
}
(PyVocab::Filename(vocab_filename), PyMerges::Filename(merges_filename)) => {
deprecation_warning(
"0.9.0",
"BPE.__init__ will not create from files anymore, try `BPE.from_files` instead",
)?;
builder = builder.files(vocab_filename, merges_filename);
builder =
builder.files(vocab_filename.to_string(), merges_filename.to_string());
}
_ => {
return Err(exceptions::PyValueError::new_err(
"`vocab` and `merges` must be both be from memory or both filenames",
));
}
}
}
@ -268,20 +277,21 @@ impl PyWordPiece {
impl PyWordPiece {
#[new]
#[args(kwargs = "**")]
fn new(vocab: Option<&PyAny>, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
fn new(vocab: Option<PyVocab>, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
let mut builder = WordPiece::builder();
if let Some(vocab_any) = vocab {
#[allow(deprecated)]
if let Ok(vocab) = vocab_any.extract() {
builder = builder.vocab(vocab);
} else {
deprecation_warning(
"0.9.0",
"WordPiece.__init__ will not create from files anymore, try `WordPiece.from_file` instead",
)?;
let vocab_filename: String = vocab_any.extract()?;
builder = builder.files(vocab_filename);
if let Some(vocab) = vocab {
match vocab {
PyVocab::Vocab(vocab) => {
builder = builder.vocab(vocab);
}
PyVocab::Filename(vocab_filename) => {
deprecation_warning(
"0.9.0",
"WordPiece.__init__ will not create from files anymore, try `WordPiece.from_file` instead",
)?;
builder = builder.files(vocab_filename.to_string());
}
}
}
@ -320,27 +330,27 @@ impl PyWordLevel {
impl PyWordLevel {
#[new]
#[args(kwargs = "**")]
fn new(vocab: Option<&PyAny>, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
fn new(vocab: Option<PyVocab>, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
let unk_token = PyWordLevel::get_unk(kwargs)?;
if let Some(vocab_object) = vocab {
let model = if let Ok(vocab) = vocab_object.extract() {
WordLevel::builder()
if let Some(vocab) = vocab {
let model = match vocab {
PyVocab::Vocab(vocab) => WordLevel::builder()
.vocab(vocab)
.unk_token(unk_token)
.build()
} else {
let filename: &str = vocab_object.extract()?;
deprecation_warning(
.build(),
PyVocab::Filename(vocab_filename) => {
deprecation_warning(
"0.9.0",
"WordLevel.__init__ will not create from files anymore, try `WordLevel.from_file` instead",
)?;
WordLevel::from_files(filename, unk_token).map_err(|e| {
exceptions::PyException::new_err(format!(
"Error while loading WordLevel: {}",
e
))
})?
WordLevel::from_files(vocab_filename, unk_token).map_err(|e| {
exceptions::PyException::new_err(format!(
"Error while loading WordLevel: {}",
e
))
})?
}
};
Ok((PyWordLevel {}, PyModel::new(Arc::new(model.into()))))
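
With the `PyVocab`/`PyMerges` extraction enums above, each model constructor now accepts either in-memory data or (deprecated) filenames. Roughly, from Python (values illustrative; the file paths are placeholders, and the file-based form warns and is slated for `from_file`/`from_files`):

from tokenizers.models import BPE, WordPiece

# In-memory form: vocab is a token -> id dict, merges maps (id_a, id_b) -> (rank, merged_id).
bpe = BPE(vocab={"a": 0, "b": 1, "ab": 2}, merges={(0, 1): (0, 2)})
wordpiece = WordPiece(vocab={"[UNK]": 0, "my": 1, "name": 2})

# Deprecated file-based form, still accepted but emits a DeprecationWarning:
# bpe = BPE("path/to/vocab.json", "path/to/merges.txt")
# wordpiece = WordPiece("path/to/vocab.txt")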

View File

@ -14,22 +14,33 @@ class TestBPE:
vocab = {"a": 0, "b": 1, "ab": 2}
merges = {(0, 1): (0, 2)}
assert isinstance(BPE(vocab, merges), Model)
with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
with pytest.raises(
ValueError, match="`vocab` and `merges` must be both specified"
):
BPE(vocab=vocab)
BPE(merges=merges)
assert isinstance(pickle.loads(pickle.dumps(BPE(vocab, merges))), BPE,)
assert isinstance(
pickle.loads(pickle.dumps(BPE(vocab, merges))),
BPE,
)
# Deprecated calls in 0.9
with pytest.deprecated_call():
assert isinstance(BPE(roberta_files["vocab"], roberta_files["merges"]), Model)
assert isinstance(
BPE(roberta_files["vocab"], roberta_files["merges"]), Model
)
with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
with pytest.raises(
ValueError, match="`vocab` and `merges` must be both specified"
):
BPE(vocab=roberta_files["vocab"])
BPE(merges=roberta_files["merges"])
with pytest.deprecated_call():
assert isinstance(
pickle.loads(pickle.dumps(BPE(roberta_files["vocab"], roberta_files["merges"]))),
pickle.loads(
pickle.dumps(BPE(roberta_files["vocab"], roberta_files["merges"]))
),
BPE,
)
@ -48,7 +59,9 @@ class TestWordPiece:
with pytest.deprecated_call():
assert isinstance(WordPiece(bert_files["vocab"]), Model)
with pytest.deprecated_call():
assert isinstance(pickle.loads(pickle.dumps(WordPiece(bert_files["vocab"]))), WordPiece)
assert isinstance(
pickle.loads(pickle.dumps(WordPiece(bert_files["vocab"]))), WordPiece
)
class TestWordLevel:

View File

@ -144,7 +144,9 @@ class TestTokenizer:
assert output.tokens == ["my", "name", "is", "john"]
# Can encode a batch with both a single sequence and a pair of sequences
output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
output = tokenizer.encode_batch(
["my name is john", ("my name is john", "pair")]
)
assert len(output) == 2
def test_encode_formats(self, bert_files):
@ -167,7 +169,9 @@ class TestTokenizer:
]
output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
output = tokenizer.encode(
["my", "name", "is", "john"], ["pair"], is_pretokenized=True
)
assert output.tokens == [
"[CLS]",
"my",
@ -213,13 +217,19 @@ class TestTokenizer:
# Numpy
test_single(np.array(["My name is John", "My name is Georges"]))
test_pair(np.array([("My name is John", "pair"), ("My name is Georges", "pair")]))
test_pair(np.array([["My name is John", "pair"], ["My name is Georges", "pair"]]))
test_pair(
np.array([("My name is John", "pair"), ("My name is Georges", "pair")])
)
test_pair(
np.array([["My name is John", "pair"], ["My name is Georges", "pair"]])
)
# PreTokenized inputs
# Lists
test_single([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True)
test_single(
[["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True
)
test_pair(
[
(["My", "name", "is", "John"], ["pair"]),
@ -236,7 +246,9 @@ class TestTokenizer:
)
# Tuples
test_single((("My", "name", "is", "John"), ("My", "name", "is", "Georges")), True)
test_single(
(("My", "name", "is", "John"), ("My", "name", "is", "Georges")), True
)
test_pair(
(
(("My", "name", "is", "John"), ("pair",)),
@ -254,10 +266,12 @@ class TestTokenizer:
# Numpy
test_single(
np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]), True,
np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]),
True,
)
test_single(
np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))), True,
np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))),
True,
)
test_pair(
np.array(
@ -298,11 +312,14 @@ class TestTokenizer:
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
tokenizer.post_processor = RobertaProcessing(
("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
("</s>", tokenizer.token_to_id("</s>")),
("<s>", tokenizer.token_to_id("<s>")),
)
# Can encode with special tokens
output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
output_with_specials = tokenizer.encode(
"My name is John", add_special_tokens=True
)
assert output_with_specials.tokens == [
"<s>",
"ĠMy",
@ -313,7 +330,9 @@ class TestTokenizer:
]
# Can encode without special tokens
output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
output_without_specials = tokenizer.encode(
"My name is John", add_special_tokens=False
)
assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
def test_truncation(self):

View File

@ -67,7 +67,10 @@ class TestByteLevelBPE:
def test_lowerspace(self, roberta_files):
tokenizer = ByteLevelBPETokenizer.from_files(
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
roberta_files["vocab"],
roberta_files["merges"],
add_prefix_space=True,
lowercase=True,
)
output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")

View File

@ -6,7 +6,9 @@ from tokenizers import CharBPETokenizer
class TestBertWordPieceBPE:
def test_basic_encode(self, openai_files):
tokenizer = CharBPETokenizer.from_files(openai_files["vocab"], openai_files["merges"])
tokenizer = CharBPETokenizer.from_files(
openai_files["vocab"], openai_files["merges"]
)
output = tokenizer.encode("My name is John", "pair")
assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
@ -50,6 +52,8 @@ class TestBertWordPieceBPE:
assert decoded == "my name is john"
def test_multiprocessing_with_parallelism(self, openai_files):
tokenizer = CharBPETokenizer.from_files(openai_files["vocab"], openai_files["merges"])
tokenizer = CharBPETokenizer.from_files(
openai_files["vocab"], openai_files["merges"]
)
multiprocessing_with_parallelism(tokenizer, False)
multiprocessing_with_parallelism(tokenizer, True)