Some suggestions from @narsil
@@ -1,6 +1,8 @@
 import jieba
 
-from tokenizers import Tokenizer, Regex
+from typing import List
+
+from tokenizers import Tokenizer, Regex, NormalizedString, PreTokenizedString
 from tokenizers.models import BPE
 from tokenizers.pre_tokenizers import PreTokenizer
 from tokenizers.normalizers import Normalizer
@@ -8,24 +10,52 @@ from tokenizers.decoders import Decoder
 
 
 class JiebaPreTokenizer:
-    def jieba_split(self, i, normalized):
-        return [normalized[w[1] : w[2]] for w in jieba.tokenize(str(normalized))]
+    def jieba_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
+        splits = []
+        # we need to call `str(normalized_string)` because jieba expects a str,
+        # not a NormalizedString
+        for token, start, stop in jieba.tokenize(str(normalized_string)):
+            splits.append(normalized_string[start:stop])
 
-    def pre_tokenize(self, pretok):
-        # Let's call split on the PreTokenizedString to split using `self.split`
-        # Here we can call `pretok.split` multiple times if we want to apply
-        # different algorithm
+        return splits
+
+        # We can also easily do it in one line:
+        # return [normalized_string[w[1] : w[2]] for w in jieba.tokenize(str(normalized_string))]
+
+    def odd_number_split(
+        self, i: int, normalized_string: NormalizedString
+    ) -> List[NormalizedString]:
+        # Just an odd example...
+        splits = []
+        last = 0
+        for (i, char) in enumerate(str(normalized_string)):
+            if char.isnumeric() and int(char) % 2 == 1:
+                splits.append(normalized_string[last:i])
+                last = i
+        # Don't forget the last one
+        splits.append(normalized_string[last:])
+        return splits
+
+    def pre_tokenize(self, pretok: PreTokenizedString):
+        # Let's call split on the PreTokenizedString to split using `self.jieba_split`
         pretok.split(self.jieba_split)
+        # Here we can call `pretok.split` multiple times if we want to apply
+        # different algorithms, but we generally just need to call it once.
+        pretok.split(self.odd_number_split)
 
 
 class CustomDecoder:
-    def decode(self, tokens):
+    def decode(self, tokens: List[str]) -> str:
         return "".join(tokens)
 
 
 class CustomNormalizer:
-    def normalize(self, normalized):
+    def normalize(self, normalized: NormalizedString):
+        # Most of these can be replaced by a `Sequence` combining some provided Normalizers
+        # (i.e. Sequence([ NFKC(), Replace(Regex(r"\s+"), " "), Lowercase() ]))
+        # and it should be the preferred way. That being said, here is an example of the kind
+        # of things that can be done here:
         normalized.nfkc()
        normalized.filter(lambda char: not char.isnumeric())
        normalized.replace(Regex(r"\s+"), " ")
        normalized.lowercase()
@@ -36,12 +66,17 @@ tok.normalizer = Normalizer.custom(CustomNormalizer())
 tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
 tok.decoder = Decoder.custom(CustomDecoder())
 
-input1 = "永和服装饰品有限公司"
-print("PreTokenize:", input1)
-print(tok.pre_tokenizer.pre_tokenize_str(input1))
+input = "永和服装饰品有限公司"
+print("PreTokenize:", input)
+print(tok.pre_tokenizer.pre_tokenize_str(input))
 # [('永和', (0, 2)), ('服装', (2, 4)), ('饰品', (4, 6)), ('有限公司', (6, 10))]
 
-input2 = "ℌ𝔢𝔩𝔩𝔬 𝔱𝔥𝔢𝔯𝔢 𝓂𝓎 𝒹ℯ𝒶𝓇 𝕕𝕖𝕒𝕣 𝕗𝕣𝕚𝕖𝕟𝕕!"
-print("Normalize:", input2)
-print(tok.normalizer.normalize_str(input2))
-# hello there my dear dear friend!
+input = "112233"
+print("PreTokenize:", input)
+print(tok.pre_tokenizer.pre_tokenize_str(input))
+# [('1', (0, 1)), ('122', (1, 4)), ('3', (4, 5)), ('3', (5, 6))]
+
+input = "1234 ℌ𝔢𝔩𝔩𝔬 𝔱𝔥𝔢𝔯𝔢 𝓂𝓎 𝒹ℯ𝒶𝓇 𝕕𝕖𝕒𝕣 𝕗𝕣𝕚𝕖𝕟𝕕!"
+print("Normalize:", input)
+print(tok.normalizer.normalize_str(input))
+# " hello there my dear dear friend!"
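
Note: the `Sequence` combination mentioned in the `CustomNormalizer` comment would look roughly like the sketch below. This is not part of the commit; the digit filter is approximated here with a `Replace` over an assumed `[0-9]` pattern, which only covers what the regex matches, unlike `char.isnumeric()`.

    from tokenizers import Regex
    from tokenizers.normalizers import Sequence, NFKC, Replace, Lowercase

    # Built-in pipeline covering most of CustomNormalizer.normalize:
    # NFKC-normalize, drop ASCII digits (approximation of the isnumeric
    # filter), collapse whitespace, then lowercase.
    tok.normalizer = Sequence([
        NFKC(),
        Replace(Regex(r"[0-9]"), ""),
        Replace(Regex(r"\s+"), " "),
        Lowercase(),
    ])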
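The custom decoder wired up with `Decoder.custom(CustomDecoder())` can also be exercised directly; a minimal check, assuming the `tok` object built in the example above and that the custom wrapper exposes the usual `decode` method:

    print(tok.decoder.decode(["永和", "服装", "饰品"]))
    # 永和服装饰品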