mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Python - Improved example with custom components
bindings/python/examples/custom_components.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import jieba

from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.normalizers import Normalizer
from tokenizers.decoders import Decoder


class JiebaPreTokenizer:
    def jieba_split(self, i, normalized):
        # jieba.tokenize yields (word, start, end); slice the NormalizedString to keep offsets
        return [normalized[w[1] : w[2]] for w in jieba.tokenize(str(normalized))]

    def pre_tokenize(self, pretok):
        # Let's call split on the PreTokenizedString to split using `self.jieba_split`
        # Here we can call `pretok.split` multiple times if we want to apply
        # different algorithms
        pretok.split(self.jieba_split)


class CustomDecoder:
    def decode(self, tokens):
        return "".join(tokens)


class CustomNormalizer:
    def normalize(self, normalized):
        normalized.nfkc()
        normalized.replace(Regex(r"\s+"), " ")
        normalized.lowercase()


# This section shows how to attach these custom components to the Tokenizer
tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
tok.decoder = Decoder.custom(CustomDecoder())

input1 = "永和服装饰品有限公司"
print("PreTokenize:", input1)
print(tok.pre_tokenizer.pre_tokenize_str(input1))
# [('永和', (0, 2)), ('服装', (2, 4)), ('饰品', (4, 6)), ('有限公司', (6, 10))]

input2 = "ℌ𝔢𝔩𝔩𝔬 𝔱𝔥𝔢𝔯𝔢 𝓂𝓎 𝒹ℯ𝒶𝓇 𝕕𝕖𝕒𝕣 𝕗𝕣𝕚𝕖𝕟𝕕!"
print("Normalize:", input2)
print(tok.normalizer.normalize_str(input2))
# hello there my dear dear friend!
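The example exercises the custom pre-tokenizer and normalizer but never calls the custom decoder. As a minimal follow-on sketch (not part of the committed file; it assumes the snippet above has already run), the decoder can be exercised the same way, since CustomDecoder.decode simply concatenates the token strings it receives:

print("Decode:", tok.decoder.decode(["永和", "服装", "饰品", "有限公司"]))
# 永和服装饰品有限公司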
@@ -1,73 +0,0 @@
import argparse

from tokenizers import Tokenizer, models, pre_tokenizers, decoders

parser = argparse.ArgumentParser()
parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab.json file")
parser.add_argument("--merges", default=None, type=str, required=True, help="The merges.txt file")
args = parser.parse_args()


class GoodCustom:
    """GoodCustom
    This class represents a good custom PreTokenizer that will be called
    by `tokenizers` when needed
    """

    def pre_tokenize(self, sentence):
        return sentence.split(" ")

    def decode(self, tokens):
        return ", ".join(tokens)


class BadCustom:
    """Bad Pretok
    This class represents a bad custom PreTokenizer that will trigger an exception
    when called by `tokenizers`
    """

    def pre_tokenize(self, sentence):
        return None

    def decode(self, tokens):
        return None


def tokenize(sentence):
    output = tokenizer.encode(sentence).tokens
    print(f"`{sentence}` tokenized to {output}")
    return output


# Create a Tokenizer using a BPE model
bpe = models.BPE(args.vocab, args.merges)
tokenizer = Tokenizer(bpe)

# Test the good custom classes
good_custom = GoodCustom()
good_pretok = pre_tokenizers.PreTokenizer.custom(good_custom)
good_decoder = decoders.Decoder.custom(good_custom)

tokenizer.pre_tokenizer = good_pretok
tokenizer.decoder = good_decoder

print("Tokenization will work with good custom:")
encoding = tokenizer.encode("Hey friend!")
print(f"IDS: {encoding.ids}")
print(f"TOKENS: {encoding.tokens}")
print(f"OFFSETS: {encoding.offsets}")
decoded = tokenizer.decode(encoding.ids)
print(f"DECODED: {decoded}")

# Now test with the bad custom classes
bad_custom = BadCustom()
bad_pretok = pre_tokenizers.PreTokenizer.custom(bad_custom)
bad_decoder = decoders.Decoder.custom(bad_custom)

tokenizer.pre_tokenizer = bad_pretok
tokenizer.decoder = bad_decoder
try:
    encoding = tokenizer.encode("Hey friend!")
except:
    print("Bad tokenizer didn't work")
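Under the in-place interface used by custom_components.py above, the whitespace splitter that the removed GoodCustom implemented by returning sentence.split(" ") would look roughly like the sketch below. This is an illustrative rewrite, not part of the commit; it reuses only constructs already shown above (str(normalized), NormalizedString slicing, and pretok.split with a callable), and the class name is made up:

class WhitespacePreTokenizer:
    def whitespace_split(self, i, normalized):
        # Slice the NormalizedString around spaces so offsets stay aligned,
        # the same way jieba_split slices around jieba's word boundaries.
        text = str(normalized)
        pieces, start = [], 0
        for idx, ch in enumerate(text):
            if ch == " ":
                if idx > start:
                    pieces.append(normalized[start:idx])
                start = idx + 1
        if start < len(text):
            pieces.append(normalized[start:])
        return pieces

    def pre_tokenize(self, pretok):
        pretok.split(self.whitespace_split)

It would be attached exactly like the jieba version, via PreTokenizer.custom(WhitespacePreTokenizer()).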
@@ -13,7 +13,7 @@ use tk::decoders::DecoderWrapper;
 use tk::Decoder;
 use tokenizers as tk;

-use super::error::{PyError, ToPyResult};
+use super::error::ToPyResult;

 #[pyclass(dict, module = "tokenizers.decoders", name=Decoder)]
 #[derive(Clone, Deserialize, Serialize)]
@@ -7,6 +7,7 @@ use tokenizers::tokenizer::Result;
 #[derive(Debug)]
 pub struct PyError(pub String);
 impl PyError {
+    #[allow(dead_code)]
     pub fn from(s: &str) -> Self {
         PyError(String::from(s))
     }
@@ -381,7 +381,7 @@ impl PyNormalizedStringRefMut {
     fn lowercase(&mut self) -> PyResult<()> {
         self.inner
             .map_mut(|n| {
-                n.nfkc();
+                n.lowercase();
             })
             .ok_or_else(PyNormalizedStringRefMut::destroyed_error)?;
         Ok(())
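This lowercase binding is what normalized.lowercase() in CustomNormalizer ends up calling; before this hunk it ran NFKC instead of lowercasing, which is why the normalizer example in this commit makes a useful smoke test. A small check through the public API, as a sketch that assumes a build of the bindings containing this fix and reuses the CustomNormalizer class and imports from the example above:

tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
# nfkc + whitespace collapse + lowercase
assert tok.normalizer.normalize_str("HeLLo   World") == "hello world"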