Python - Improved example with custom components

Anthony MOI
2020-09-21 14:30:52 -04:00
committed by Anthony MOI
parent 0a930ef1d8
commit b1097a988f
5 changed files with 50 additions and 75 deletions

View File

@@ -0,0 +1,47 @@
import jieba

from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.normalizers import Normalizer
from tokenizers.decoders import Decoder


class JiebaPreTokenizer:
    def jieba_split(self, i, normalized):
        # jieba.tokenize yields (word, start, stop) tuples; slicing the
        # NormalizedString keeps the offsets aligned with the original text
        return [normalized[start:stop] for word, start, stop in jieba.tokenize(str(normalized))]

    def pre_tokenize(self, pretok):
        # Let's call split on the PreTokenizedString to split using `self.jieba_split`.
        # Here we could call `pretok.split` multiple times if we wanted to apply
        # different algorithms in sequence
        pretok.split(self.jieba_split)


class CustomDecoder:
    def decode(self, tokens):
        return "".join(tokens)


class CustomNormalizer:
    def normalize(self, normalized):
        normalized.nfkc()
        normalized.replace(Regex(r"\s+"), " ")
        normalized.lowercase()


# This section shows how to attach these custom components to the Tokenizer
tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
tok.decoder = Decoder.custom(CustomDecoder())

input1 = "永和服装饰品有限公司"
print("PreTokenize:", input1)
print(tok.pre_tokenizer.pre_tokenize_str(input1))
# [('永和', (0, 2)), ('服装', (2, 4)), ('饰品', (4, 6)), ('有限公司', (6, 10))]

input2 = "𝔥𝔢𝔩𝔩𝔬 𝔱𝔥𝔢𝔯𝔢 𝓂𝓎 𝒹ℯ𝒶𝓇 𝕕𝕖𝕒𝕣 𝕗𝕣𝕚𝕖𝕟𝕕!"
print("Normalize:", input2)
print(tok.normalizer.normalize_str(input2))
# hello there my dear dear friend!
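
One caveat worth knowing: a tokenizer using these custom Python components cannot be serialized with `Tokenizer.save`, since the Rust side cannot persist arbitrary Python objects. As a quick end-to-end check, here is a minimal usage sketch (not part of this commit, and assuming a `tokenizers` version that provides `train_from_iterator` and `BpeTrainer`): train a tiny BPE vocabulary so `encode` has ids to emit, then round-trip through the custom decoder.

from tokenizers.trainers import BpeTrainer

# Hypothetical smoke test: a tiny corpus is enough to exercise the pipeline
trainer = BpeTrainer(special_tokens=["[UNK]"], vocab_size=200)
tok.train_from_iterator(["永和服装饰品有限公司", "我爱北京天安门"], trainer=trainer)

encoding = tok.encode("永和服装饰品有限公司")
print(encoding.tokens)
print(tok.decode(encoding.ids))  # CustomDecoder joins the tokens back without spaces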

View File

@@ -1,73 +0,0 @@
import argparse

from tokenizers import Tokenizer, models, pre_tokenizers, decoders

parser = argparse.ArgumentParser()
parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab.json file")
parser.add_argument("--merges", default=None, type=str, required=True, help="The merges.txt file")
args = parser.parse_args()


class GoodCustom:
    """GoodCustom
    This class represents a good custom PreTokenizer that will be called
    by `tokenizers` when needed
    """

    def pre_tokenize(self, sentence):
        return sentence.split(" ")

    def decode(self, tokens):
        return ", ".join(tokens)


class BadCustom:
    """Bad Pretok
    This class represents a bad custom PreTokenizer that will trigger an exception
    when called by `tokenizers`
    """

    def pre_tokenize(self, sentence):
        return None

    def decode(self, tokens):
        return None


def tokenize(sentence):
    output = tokenizer.encode(sentence).tokens
    print(f"`{sentence}` tokenized to {output}")
    return output


# Create a Tokenizer using a BPE model
bpe = models.BPE(args.vocab, args.merges)
tokenizer = Tokenizer(bpe)

# Test the good custom classes
good_custom = GoodCustom()
good_pretok = pre_tokenizers.PreTokenizer.custom(good_custom)
good_decoder = decoders.Decoder.custom(good_custom)
tokenizer.pre_tokenizer = good_pretok
tokenizer.decoder = good_decoder

print("Tokenization will work with good custom:")
encoding = tokenizer.encode("Hey friend!")
print(f"IDS: {encoding.ids}")
print(f"TOKENS: {encoding.tokens}")
print(f"OFFSETS: {encoding.offsets}")
decoded = tokenizer.decode(encoding.ids)
print(f"DECODED: {decoded}")

# Now test with the bad custom classes
bad_custom = BadCustom()
bad_pretok = pre_tokenizers.PreTokenizer.custom(bad_custom)
bad_decoder = decoders.Decoder.custom(bad_custom)
tokenizer.pre_tokenizer = bad_pretok
tokenizer.decoder = bad_decoder

try:
    encoding = tokenizer.encode("Hey friend!")
except Exception as e:
    # a bare `except:` would also swallow KeyboardInterrupt; be explicit
    print(f"Bad tokenizer didn't work: {e}")

View File

@@ -13,7 +13,7 @@ use tk::decoders::DecoderWrapper;
 use tk::Decoder;
 use tokenizers as tk;

-use super::error::{PyError, ToPyResult};
+use super::error::ToPyResult;

 #[pyclass(dict, module = "tokenizers.decoders", name=Decoder)]
 #[derive(Clone, Deserialize, Serialize)]

View File

@@ -7,6 +7,7 @@ use tokenizers::tokenizer::Result;
 #[derive(Debug)]
 pub struct PyError(pub String);
 impl PyError {
+    #[allow(dead_code)]
     pub fn from(s: &str) -> Self {
         PyError(String::from(s))
     }

View File

@@ -381,7 +381,7 @@ impl PyNormalizedStringRefMut {
     fn lowercase(&mut self) -> PyResult<()> {
         self.inner
             .map_mut(|n| {
-                n.nfkc();
+                n.lowercase();
             })
             .ok_or_else(PyNormalizedStringRefMut::destroyed_error)?;
         Ok(())
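
This last hunk fixes a copy-paste bug: the Python binding for `NormalizedString.lowercase` was calling `nfkc` instead of `lowercase`, so a custom normalizer invoking `normalized.lowercase()` never actually lowercased anything. A quick sketch to verify the fix (not part of the commit):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Normalizer


class LowercaseOnly:
    def normalize(self, normalized):
        # exercises the fixed `lowercase` binding on the NormalizedString
        normalized.lowercase()


tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(LowercaseOnly())
print(tok.normalizer.normalize_str("HELLO There"))
# hello there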