mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Python - Improved example with custom components
bindings/python/examples/custom_components.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import jieba

from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.normalizers import Normalizer
from tokenizers.decoders import Decoder


class JiebaPreTokenizer:
    def jieba_split(self, i, normalized):
        # jieba.tokenize yields (word, start, end); slice the NormalizedString to keep offsets
        return [normalized[w[1] : w[2]] for w in jieba.tokenize(str(normalized))]

    def pre_tokenize(self, pretok):
        # Let's call split on the PreTokenizedString to split using `self.jieba_split`
        # Here we can call `pretok.split` multiple times if we want to apply
        # different algorithms
        pretok.split(self.jieba_split)


class CustomDecoder:
    def decode(self, tokens):
        return "".join(tokens)


class CustomNormalizer:
    def normalize(self, normalized):
        normalized.nfkc()
        normalized.replace(Regex(r"\s+"), " ")
        normalized.lowercase()


# This section shows how to attach these custom components to the Tokenizer
tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
tok.decoder = Decoder.custom(CustomDecoder())

input1 = "永和服装饰品有限公司"
print("PreTokenize:", input1)
print(tok.pre_tokenizer.pre_tokenize_str(input1))
# [('永和', (0, 2)), ('服装', (2, 4)), ('饰品', (4, 6)), ('有限公司', (6, 10))]

input2 = "ℌ𝔢𝔩𝔩𝔬 𝔱𝔥𝔢𝔯𝔢 𝓂𝓎 𝒹ℯ𝒶𝓇 𝕕𝕖𝕒𝕣 𝕗𝕣𝕚𝕖𝕟𝕕!"
print("Normalize:", input2)
print(tok.normalizer.normalize_str(input2))
# hello there my dear dear friend!
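The example exercises the custom pre-tokenizer and normalizer but never calls the custom decoder. As a minimal follow-on sketch (not part of the committed file; it assumes the snippet above has already run), the decoder can be exercised the same way, since CustomDecoder.decode simply concatenates the token strings it receives:

print("Decode:", tok.decoder.decode(["永和", "服装", "饰品", "有限公司"]))
# 永和服装饰品有限公司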
@@ -1,73 +0,0 @@
import argparse

from tokenizers import Tokenizer, models, pre_tokenizers, decoders

parser = argparse.ArgumentParser()
parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab.json file")
parser.add_argument("--merges", default=None, type=str, required=True, help="The merges.txt file")
args = parser.parse_args()


class GoodCustom:
    """GoodCustom
    This class represents a good custom PreTokenizer that will be called
    by `tokenizers` when needed
    """

    def pre_tokenize(self, sentence):
        return sentence.split(" ")

    def decode(self, tokens):
        return ", ".join(tokens)


class BadCustom:
    """Bad Pretok
    This class represents a bad custom PreTokenizer that will trigger an exception
    when called by `tokenizers`
    """

    def pre_tokenize(self, sentence):
        return None

    def decode(self, tokens):
        return None


def tokenize(sentence):
    output = tokenizer.encode(sentence).tokens
    print(f"`{sentence}` tokenized to {output}")
    return output


# Create a Tokenizer using a BPE model
bpe = models.BPE(args.vocab, args.merges)
tokenizer = Tokenizer(bpe)

# Test the good custom classes
good_custom = GoodCustom()
good_pretok = pre_tokenizers.PreTokenizer.custom(good_custom)
good_decoder = decoders.Decoder.custom(good_custom)

tokenizer.pre_tokenizer = good_pretok
tokenizer.decoder = good_decoder

print("Tokenization will work with good custom:")
encoding = tokenizer.encode("Hey friend!")
print(f"IDS: {encoding.ids}")
print(f"TOKENS: {encoding.tokens}")
print(f"OFFSETS: {encoding.offsets}")
decoded = tokenizer.decode(encoding.ids)
print(f"DECODED: {decoded}")

# Now test with the bad custom classes
bad_custom = BadCustom()
bad_pretok = pre_tokenizers.PreTokenizer.custom(bad_custom)
bad_decoder = decoders.Decoder.custom(bad_custom)

tokenizer.pre_tokenizer = bad_pretok
tokenizer.decoder = bad_decoder
try:
    encoding = tokenizer.encode("Hey friend!")
except:
    print("Bad tokenizer didn't work")
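Under the in-place interface used by custom_components.py above, the whitespace splitter that the removed GoodCustom implemented by returning sentence.split(" ") would look roughly like the sketch below. This is an illustrative rewrite, not part of the commit; it reuses only constructs already shown above (str(normalized), NormalizedString slicing, and pretok.split with a callable), and the class name is made up:

class WhitespacePreTokenizer:
    def whitespace_split(self, i, normalized):
        # Slice the NormalizedString around spaces so offsets stay aligned,
        # the same way jieba_split slices around jieba's word boundaries.
        text = str(normalized)
        pieces, start = [], 0
        for idx, ch in enumerate(text):
            if ch == " ":
                if idx > start:
                    pieces.append(normalized[start:idx])
                start = idx + 1
        if start < len(text):
            pieces.append(normalized[start:])
        return pieces

    def pre_tokenize(self, pretok):
        pretok.split(self.whitespace_split)

It would be attached exactly like the jieba version, via PreTokenizer.custom(WhitespacePreTokenizer()).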
@@ -13,7 +13,7 @@ use tk::decoders::DecoderWrapper;
 use tk::Decoder;
 use tokenizers as tk;

-use super::error::{PyError, ToPyResult};
+use super::error::ToPyResult;

 #[pyclass(dict, module = "tokenizers.decoders", name=Decoder)]
 #[derive(Clone, Deserialize, Serialize)]
@@ -7,6 +7,7 @@ use tokenizers::tokenizer::Result;
 #[derive(Debug)]
 pub struct PyError(pub String);
 impl PyError {
+    #[allow(dead_code)]
     pub fn from(s: &str) -> Self {
         PyError(String::from(s))
     }
@@ -381,7 +381,7 @@ impl PyNormalizedStringRefMut {
     fn lowercase(&mut self) -> PyResult<()> {
         self.inner
             .map_mut(|n| {
-                n.nfkc();
+                n.lowercase();
             })
             .ok_or_else(PyNormalizedStringRefMut::destroyed_error)?;
         Ok(())
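This lowercase binding is what normalized.lowercase() in CustomNormalizer ends up calling; before this hunk it ran NFKC instead of lowercasing, which is why the normalizer example in this commit makes a useful smoke test. A small check through the public API, as a sketch that assumes a build of the bindings containing this fix and reuses the CustomNormalizer class and imports from the example above:

tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
# nfkc + whitespace collapse + lowercase
assert tok.normalizer.normalize_str("HeLLo   World") == "hello world"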