Python - Improved example with custom components

New file: bindings/python/examples/custom_components.py (+47 lines)

@@ -0,0 +1,47 @@
import jieba

from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.normalizers import Normalizer
from tokenizers.decoders import Decoder
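
# jieba is a third-party Chinese word-segmentation library (install it with
# `pip install jieba`); it is only needed for the JiebaPreTokenizer below.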


class JiebaPreTokenizer:
    def jieba_split(self, i, normalized):
        return [normalized[w[1] : w[2]] for w in jieba.tokenize(str(normalized))]

    def pre_tokenize(self, pretok):
        # Let's call split on the PreTokenizedString to split using `self.jieba_split`.
        # We could call `pretok.split` multiple times here if we wanted to apply
        # different algorithms in sequence.
        pretok.split(self.jieba_split)
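
# Note: jieba.tokenize() yields (word, start, stop) tuples, and the slice
# normalized[start:stop] returns a piece of the NormalizedString itself rather
# than a plain str. Slicing the NormalizedString keeps offset tracking intact,
# which is why the offsets printed below still map back to the original input.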


class CustomDecoder:
    def decode(self, tokens):
        return "".join(tokens)


class CustomNormalizer:
    def normalize(self, normalized):
        normalized.nfkc()
        normalized.replace(Regex(r"\s+"), " ")
        normalized.lowercase()
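
# Note: the three steps above mutate the same NormalizedString in place, in
# order (NFKC, collapse whitespace runs, then lowercase), and each step keeps
# the alignment with the original text up to date.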


# This section shows how to attach these custom components to the Tokenizer
tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
tok.decoder = Decoder.custom(CustomDecoder())

input1 = "永和服装饰品有限公司"
print("PreTokenize:", input1)
print(tok.pre_tokenizer.pre_tokenize_str(input1))
# [('永和', (0, 2)), ('服装', (2, 4)), ('饰品', (4, 6)), ('有限公司', (6, 10))]

input2 = "ℌ𝔢𝔩𝔩𝔬 𝔱𝔥𝔢𝔯𝔢 𝓂𝓎 𝒹ℯ𝒶𝓇 𝕕𝕖𝕒𝕣 𝕗𝕣𝕚𝕖𝕟𝕕!"
print("Normalize:", input2)
print(tok.normalizer.normalize_str(input2))
# hello there my dear dear friend!
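
The custom decoder can be exercised the same way. A minimal sketch (the token
list is made up for illustration, since the BPE model above has an empty
vocabulary):

print(tok.decoder.decode(["永和", "服装", "饰品"]))
# 永和服装饰品  (CustomDecoder simply concatenates the tokens)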

Removed file (the previous version of this example):

@@ -1,73 +0,0 @@
import argparse

from tokenizers import Tokenizer, models, pre_tokenizers, decoders

parser = argparse.ArgumentParser()
parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab.json file")
parser.add_argument("--merges", default=None, type=str, required=True, help="The merges.txt file")
args = parser.parse_args()


class GoodCustom:
    """GoodCustom
    This class represents a good custom PreTokenizer that will be called
    by `tokenizers` when needed
    """

    def pre_tokenize(self, sentence):
        return sentence.split(" ")

    def decode(self, tokens):
        return ", ".join(tokens)


class BadCustom:
    """Bad Pretok
    This class represents a bad custom PreTokenizer that will trigger an exception
    when called by `tokenizers`
    """

    def pre_tokenize(self, sentence):
        return None

    def decode(self, tokens):
        return None


def tokenize(sentence):
    output = tokenizer.encode(sentence).tokens
    print(f"`{sentence}` tokenized to {output}")
    return output


# Create a Tokenizer using a BPE model
bpe = models.BPE(args.vocab, args.merges)
tokenizer = Tokenizer(bpe)

# Test the good custom classes
good_custom = GoodCustom()
good_pretok = pre_tokenizers.PreTokenizer.custom(good_custom)
good_decoder = decoders.Decoder.custom(good_custom)

tokenizer.pre_tokenizer = good_pretok
tokenizer.decoder = good_decoder

print("Tokenization will work with good custom:")
encoding = tokenizer.encode("Hey friend!")
print(f"IDS: {encoding.ids}")
print(f"TOKENS: {encoding.tokens}")
print(f"OFFSETS: {encoding.offsets}")
decoded = tokenizer.decode(encoding.ids)
print(f"DECODED: {decoded}")

# Now test with the bad custom classes
bad_custom = BadCustom()
bad_pretok = pre_tokenizers.PreTokenizer.custom(bad_custom)
bad_decoder = decoders.Decoder.custom(bad_custom)

tokenizer.pre_tokenizer = bad_pretok
tokenizer.decoder = bad_decoder
try:
    encoding = tokenizer.encode("Hey friend!")
except:
    print("Bad tokenizer didn't work")

In the Rust bindings for the decoders module, the now-unused `PyError` import is dropped:

@@ -13,7 +13,7 @@ use tk::decoders::DecoderWrapper;
 use tk::Decoder;
 use tokenizers as tk;
 
-use super::error::{PyError, ToPyResult};
+use super::error::ToPyResult;
 
 #[pyclass(dict, module = "tokenizers.decoders", name=Decoder)]
 #[derive(Clone, Deserialize, Serialize)]

In the bindings' error module, `PyError::from` is kept but marked `#[allow(dead_code)]`, apparently because nothing calls it any longer after the cleanup above:

@@ -7,6 +7,7 @@ use tokenizers::tokenizer::Result;
 #[derive(Debug)]
 pub struct PyError(pub String);
 impl PyError {
+    #[allow(dead_code)]
     pub fn from(s: &str) -> Self {
         PyError(String::from(s))
     }

In the `PyNormalizedStringRefMut` bindings, `lowercase()` previously applied NFKC instead of lowercasing, so `normalized.lowercase()` calls from Python (like the one in the example above) had no lowercasing effect; this hunk fixes it:

@@ -381,7 +381,7 @@ impl PyNormalizedStringRefMut {
     fn lowercase(&mut self) -> PyResult<()> {
         self.inner
             .map_mut(|n| {
-                n.nfkc();
+                n.lowercase();
             })
             .ok_or_else(PyNormalizedStringRefMut::destroyed_error)?;
         Ok(())
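
A quick way to check the fixed behavior from Python; a minimal sketch, assuming a build that includes this change (the class name and input are made up for illustration):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Normalizer

class LowercaseOnly:
    def normalize(self, normalized):
        # With the fix, this lowercases; before it, it applied NFKC instead.
        normalized.lowercase()

tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(LowercaseOnly())
print(tok.normalizer.normalize_str("Hey Friend!"))  # expected: "hey friend!"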