Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Python - Update examples and improve errors
```diff
@@ -1,49 +1,69 @@
 import argparse
 
-from tokenizers import Tokenizer, models, pre_tokenizers
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab.json file")
 parser.add_argument("--merges", default=None, type=str, required=True, help="The merges.txt file")
 args = parser.parse_args()
 
-class MyPreTok:
-    """
-    This class represents a custom PreTokenizer that will be called
+class GoodCustom:
+    """GoodCustom
+    This class represents a good custom PreTokenizer that will be called
     by `tokenizers` when needed
     """
     def pre_tokenize(self, sentence):
-        if sentence.startswith("Hello"):
-            # This will generate an error
-            return None
-        # Prepend "Haha"
-        return sum([ [ "Haha" ], sentence.split(" ") ], [])
+        return sentence.split(" ")
 
-# Create a PreTokenizer from our custom one
-mypretok = MyPreTok()
-pretok = pre_tokenizers.PreTokenizer.custom(mypretok)
+    def decode(self, tokens):
+        return ", ".join(tokens)
 
-# Create a Tokenizer using a BPE model
-bpe = models.BPE.from_files(args.vocab, args.merges)
-tokenizer = Tokenizer(bpe)
+class BadCustom:
+    """Bad Pretok
+    This class represents a bad custom PreTokenizer that will trigger an exception
+    when called by `tokenizers`
+    """
+    def pre_tokenize(self, sentence):
+        return None
 
-# And attach our PreTokenizer
-tokenizer.with_pre_tokenizer(pretok)
+    def decode(self, tokens):
+        return None
 
 def tokenize(sentence):
-    output = [ token.value for token in tokenizer.encode(sentence) ]
+    output = tokenizer.encode(sentence).tokens
     print(f"`{sentence}` tokenized to {output}")
     return output
 
-## Good example
-# Our PreTokenizer has been used as expected
-assert(tokenize("Hey friend") == [ "H", "aha", "Hey", "friend" ])
-
-## Bad example
-# In this case, our PreTokenizer returns None instead of a List[str]
-# So it doesn't work as expected, and we get a empty list back, with an error printed
-assert(tokenize("Hello friend") == [])
+# Create a Tokenizer using a BPE model
+tokenizer = Tokenizer(models.BPE.from_files(args.vocab, args.merges))
+
+# Test the good custom classes
+good_custom = GoodCustom()
+good_pretok = pre_tokenizers.PreTokenizer.custom(good_custom)
+good_decoder = decoders.Decoder.custom(good_custom)
+
+tokenizer.with_pre_tokenizer(good_pretok)
+tokenizer.with_decoder(good_decoder)
+
+print("Tokenization will work with good custom:")
+encoding = tokenizer.encode("Hey friend!")
+print(f"IDS: {encoding.ids}")
+print(f"TOKENS: {encoding.tokens}")
+print(f"OFFSETS: {encoding.offsets}")
+decoded = tokenizer.decode(encoding.ids)
+print(f"DECODED: {decoded}")
+
+# Now test with the bad custom classes
+bad_custom = BadCustom()
+bad_pretok = pre_tokenizers.PreTokenizer.custom(bad_custom)
+bad_decoder = decoders.Decoder.custom(bad_custom)
+
+tokenizer.with_pre_tokenizer(bad_pretok)
+tokenizer.with_decoder(bad_decoder)
+try:
+    encoding = tokenizer.encode("Hey friend!")
+except:
+    print("Bad tokenizer didn't work")
```
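The updated example relies on a duck-typed contract: any object whose `pre_tokenize` returns a `List[str]` and whose `decode` returns a `str` can be wrapped with `PreTokenizer.custom` and `Decoder.custom`. As a minimal sketch of the good path, using only the API visible in this diff ("vocab.json"/"merges.txt" stand in for the `--vocab`/`--merges` arguments, so this assumes a trained BPE vocabulary on disk):

```python
from tokenizers import Tokenizer, models, pre_tokenizers, decoders

class CommaJoiner:
    """Illustrative custom component honoring the contract above."""
    def pre_tokenize(self, sentence):
        return sentence.split(" ")   # must be a List[str]

    def decode(self, tokens):
        return ", ".join(tokens)     # must be a str

# Placeholder paths; in the example they come from argparse.
tokenizer = Tokenizer(models.BPE.from_files("vocab.json", "merges.txt"))

custom = CommaJoiner()
tokenizer.with_pre_tokenizer(pre_tokenizers.PreTokenizer.custom(custom))
tokenizer.with_decoder(decoders.Decoder.custom(custom))

encoding = tokenizer.encode("Hey friend!")
print(encoding.tokens)                  # produced via CommaJoiner.pre_tokenize
print(tokenizer.decode(encoding.ids))   # round-trips through CommaJoiner.decode
```

`BadCustom` breaks the same contract on purpose, which is exactly what the two Rust hunks below now report more usefully.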
```diff
@@ -72,10 +72,10 @@ impl tk::tokenizer::Decoder for PyDecoder {
                 .to_string()
                 .map_err(|_| PyError::from("`decode` is expected to return a str"))?
                 .into_owned()),
-            Err(e) => Err(Box::new(PyError(format!(
-                "Error while calling `decode`: {:?}",
-                e
-            )))),
+            Err(e) => {
+                e.print(py);
+                Err(Box::new(PyError::from("Error while calling `decode`")))
+            }
         }
     }
 }
```
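The change above stops flattening the pyo3 error into the returned message and instead calls `e.print(py)`, which writes the original Python traceback to stderr before a short generic error is returned. Seen from Python, the improvement looks roughly like this (a sketch reusing `tokenizer` and the imports from the previous snippet; the exact exception type reaching the caller depends on the binding layer):

```python
class ExplodingDecoder:
    def pre_tokenize(self, sentence):
        return sentence.split(" ")

    def decode(self, tokens):
        # Previously reported as an opaque Rust-side string; now the full
        # ValueError traceback is printed before the generic error.
        raise ValueError("decode blew up")

tokenizer.with_decoder(decoders.Decoder.custom(ExplodingDecoder()))
ids = tokenizer.encode("Hey friend!").ids
try:
    tokenizer.decode(ids)
except Exception:
    print("decode failed; the ValueError traceback was already printed")
```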
```diff
@@ -82,10 +82,12 @@ impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
                 .map_err(|_| PyError::from("`pre_tokenize is expected to return a List[str]"))?
                 .extract::<Vec<String>>()
                 .map_err(|_| PyError::from("`pre_tokenize` is expected to return a List[str]"))?),
-            Err(e) => Err(Box::new(PyError(format!(
-                "Error while calling `pre_tokenize`: {:?}",
-                e
-            )))),
+            Err(e) => {
+                e.print(py);
+                Err(Box::new(PyError::from(
+                    "Error while calling `pre_tokenize`",
+                )))
+            }
         }
     }
 }
```
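`pre_tokenize` gets the same treatment, and it has two distinct failure modes: a return value that is not a `List[str]` trips the `map_err` messages, while an exception raised in user code now has its traceback printed by `e.print(py)` before the generic error comes back. A sketch of both, again assuming the setup from the first snippet (and that `PreTokenizer.custom` only requires a `pre_tokenize` method):

```python
class NoneReturning:
    def pre_tokenize(self, sentence):
        return None  # breaks the List[str] contract, like BadCustom

class Raising:
    def pre_tokenize(self, sentence):
        raise RuntimeError("boom inside pre_tokenize")

for custom in (NoneReturning(), Raising()):
    tokenizer.with_pre_tokenizer(pre_tokenizers.PreTokenizer.custom(custom))
    try:
        tokenizer.encode("Hey friend!")
    except Exception as err:
        # NoneReturning should surface the "expected to return a List[str]"
        # message; Raising should print its RuntimeError traceback first.
        print(f"encode failed: {err}")
```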