# tokenizers/bindings/python/examples/custom_components.py
import jieba
from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.normalizers import Normalizer
from tokenizers.decoders import Decoder


class JiebaPreTokenizer:
    def jieba_split(self, i, normalized):
        # `jieba.tokenize` expects a plain `str` and yields `(word, start, stop)`
        # tuples, so we convert the NormalizedString and return slices of it,
        # letting the library keep track of the offsets for us.
        return [normalized[w[1] : w[2]] for w in jieba.tokenize(str(normalized))]

    def pre_tokenize(self, pretok):
        # Let's call split on the PreTokenizedString to split using `self.jieba_split`.
        # We could call `pretok.split` several times here to chain different
        # splitting algorithms (see the sketch below).
        pretok.split(self.jieba_split)
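    # (Added sketch, not part of the original example.) A second callback could,
    # for instance, split before every numeric character. To chain it with the
    # jieba split, add `pretok.split(self.numeric_split)` in `pre_tokenize` above.
    def numeric_split(self, i, normalized):
        splits = []
        last = 0
        for idx, char in enumerate(str(normalized)):
            if char.isnumeric() and idx > last:
                splits.append(normalized[last:idx])
                last = idx
        # Don't forget the trailing piece
        splits.append(normalized[last:])
        return splits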


class CustomDecoder:
    def decode(self, tokens):
        return "".join(tokens)


class CustomNormalizer:
    def normalize(self, normalized):
        # The NormalizedString is modified in place and keeps the alignment
        # with the original string for us.
        normalized.nfkc()
        normalized.replace(Regex(r"\s+"), " ")
        normalized.lowercase()
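

# (Added note, not part of the original example.) Roughly the same normalization
# can likely be built from the library's own components instead of a custom
# class, e.g. something like:
#   from tokenizers import normalizers
#   normalizers.Sequence([normalizers.NFKC(), normalizers.Replace(Regex(r"\s+"), " "), normalizers.Lowercase()])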
# This section shows how to attach these custom components to the Tokenizer
tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
tok.decoder = Decoder.custom(CustomDecoder())
input1 = "永和服装饰品有限公司"
print("PreTokenize:", input1)
print(tok.pre_tokenizer.pre_tokenize_str(input1))
# [('永和', (0, 2)), ('服装', (2, 4)), ('饰品', (4, 6)), ('有限公司', (6, 10))]
# Styled (mathematical alphanumeric) characters that NFKC folds back to ASCII
input2 = "𝔥𝔢𝔩𝔩𝔬 𝔱𝔥𝔢𝔯𝔢 𝓂𝓎 𝒹ℯ𝒶𝓇 𝕕𝕖𝕒𝕣 𝕗𝕣𝕚𝕖𝕟𝕕!"
print("Normalize:", input2)
print(tok.normalizer.normalize_str(input2))
# hello there my dear dear friend!
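
# (Added sketch, not part of the original example.) Assuming the wrapped
# `Decoder` exposes `decode(tokens)`, the custom decoder attached above can be
# exercised the same way; here it simply concatenates the tokens.
tokens = ["永和", "服装", "饰品", "有限公司"]
print("Decode:", tokens)
print(tok.decoder.decode(tokens))
# 永和服装饰品有限公司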