diff --git a/bindings/python/examples/custom_pre_tokenizer.py b/bindings/python/examples/custom_pre_tokenizer.py
index 60c4309b..fdd88665 100644
--- a/bindings/python/examples/custom_pre_tokenizer.py
+++ b/bindings/python/examples/custom_pre_tokenizer.py
@@ -1,49 +1,69 @@
 import argparse
-from tokenizers import Tokenizer, models, pre_tokenizers
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab.json file")
 parser.add_argument("--merges", default=None, type=str, required=True, help="The merges.txt file")
 args = parser.parse_args()
 
-class MyPreTok:
-    """
-    This class represents a custom PreTokenizer that will be called
+class GoodCustom:
+    """GoodCustom
+    This class represents a good custom PreTokenizer that will be called
     by `tokenizers` when needed
     """
 
     def pre_tokenize(self, sentence):
-        if sentence.startswith("Hello"):
-            # This will generate an error
-            return None
+        return sentence.split(" ")
 
-        # Prepend "Haha"
-        return sum([ [ "Haha" ], sentence.split(" ") ], [])
+    def decode(self, tokens):
+        return ", ".join(tokens)
 
+class BadCustom:
+    """Bad Pretok
+    This class represents a bad custom PreTokenizer that will trigger an exception
+    when called by `tokenizers`
+    """
+    def pre_tokenize(self, sentence):
+        return None
 
-# Create a PreTokenizer from our custom one
-mypretok = MyPreTok()
-pretok = pre_tokenizers.PreTokenizer.custom(mypretok)
-
-# Create a Tokenizer using a BPE model
-bpe = models.BPE.from_files(args.vocab, args.merges)
-tokenizer = Tokenizer(bpe)
-
-# And attach our PreTokenizer
-tokenizer.with_pre_tokenizer(pretok)
-
+    def decode(self, tokens):
+        return None
 
 def tokenize(sentence):
-    output = [ token.value for token in tokenizer.encode(sentence) ]
+    output = tokenizer.encode(sentence).tokens
     print(f"`{sentence}` tokenized to {output}")
     return output
 
-## Good example
-# Our PreTokenizer has been used as expected
-assert(tokenize("Hey friend") == [ "H", "aha", "Hey", "friend" ])
+# Create a Tokenizer using a BPE model
+bpe = models.BPE.from_files(args.vocab, args.merges)
+tokenizer = Tokenizer(bpe)
+
+# Test the good custom classes
+good_custom = GoodCustom()
+good_pretok = pre_tokenizers.PreTokenizer.custom(good_custom)
+good_decoder = decoders.Decoder.custom(good_custom)
+
+tokenizer.with_pre_tokenizer(good_pretok)
+tokenizer.with_decoder(good_decoder)
+
+print("Tokenization will work with good custom:")
+encoding = tokenizer.encode("Hey friend!")
+print(f"IDS: {encoding.ids}")
+print(f"TOKENS: {encoding.tokens}")
+print(f"OFFSETS: {encoding.offsets}")
+decoded = tokenizer.decode(encoding.ids)
+print(f"DECODED: {decoded}")
+
+# Now test with the bad custom classes
+bad_custom = BadCustom()
+bad_pretok = pre_tokenizers.PreTokenizer.custom(bad_custom)
+bad_decoder = decoders.Decoder.custom(bad_custom)
+
+tokenizer.with_pre_tokenizer(bad_pretok)
+tokenizer.with_decoder(bad_decoder)
+try:
+    encoding = tokenizer.encode("Hey friend!")
+except Exception:
+    print("Bad custom classes raised an error, as expected")
 
-## Bad example
-# In this case, our PreTokenizer returns None instead of a List[str]
-# So it doesn't work as expected, and we get a empty list back, with an error printed
-assert(tokenize("Hello friend") == [])
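A note on the example above: `pre_tokenizers.PreTokenizer.custom` and `decoders.Decoder.custom` wrap any Python object, provided it exposes a `pre_tokenize(sentence)` method returning a `List[str]` (for a pre-tokenizer) and/or a `decode(tokens)` method returning a `str` (for a decoder). A minimal sketch of that contract, using only the API exercised by the example (the `Uppercase` class and the file paths are hypothetical, for illustration):

    from tokenizers import Tokenizer, models, pre_tokenizers, decoders

    class Uppercase:
        def pre_tokenize(self, sentence):
            # Must return a List[str]; any other return value is rejected
            # by the bindings with "`pre_tokenize` is expected to return
            # a List[str]"
            return sentence.upper().split(" ")

        def decode(self, tokens):
            # Must return a str
            return " ".join(tokens)

    custom = Uppercase()
    tokenizer = Tokenizer(models.BPE.from_files("vocab.json", "merges.txt"))
    tokenizer.with_pre_tokenizer(pre_tokenizers.PreTokenizer.custom(custom))
    tokenizer.with_decoder(decoders.Decoder.custom(custom))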
diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index cb504a16..3c8c8bbf 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -72,10 +72,10 @@ impl tk::tokenizer::Decoder for PyDecoder {
                 .to_string()
                 .map_err(|_| PyError::from("`decode` is expected to return a str"))?
                 .into_owned()),
-            Err(e) => Err(Box::new(PyError(format!(
-                "Error while calling `decode`: {:?}",
-                e
-            )))),
+            Err(e) => {
+                e.print(py);
+                Err(Box::new(PyError::from("Error while calling `decode`")))
+            }
         }
     }
 }
diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index 046a9277..350ba5e6 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -82,10 +82,12 @@ impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
                 .map_err(|_| PyError::from("`pre_tokenize` is expected to return a List[str]"))?
                 .extract::<Vec<String>>()
                 .map_err(|_| PyError::from("`pre_tokenize` is expected to return a List[str]"))?),
-            Err(e) => Err(Box::new(PyError(format!(
-                "Error while calling `pre_tokenize`: {:?}",
-                e
-            )))),
+            Err(e) => {
+                e.print(py);
+                Err(Box::new(PyError::from(
+                    "Error while calling `pre_tokenize`",
+                )))
+            }
         }
     }
 }
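With the two Rust changes above, an exception raised inside a custom `pre_tokenize` or `decode` is no longer flattened into a `{:?}`-formatted message: `e.print(py)` prints the original Python traceback to stderr, and a generic `PyError` is returned instead. Note that `BadCustom` in the example does not raise; it returns `None`, so the error it triggers comes from the type checks in the `Ok(..)` branch rather than from this new `Err(e)` path. A sketch of what the caller sees either way, assuming the `tokenizer` from the example with the bad custom classes attached:

    try:
        tokenizer.encode("Hey friend!")
    except Exception as err:
        # For BadCustom, `err` stems from the binding's type check
        # ("`pre_tokenize` is expected to return a List[str]"). If
        # pre_tokenize had raised instead, the original traceback would
        # already have been printed by `e.print(py)` before this point.
        print(f"encode failed: {err}")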