mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-05 12:18:20 +00:00
Adding a new "Replace" normalizer that takes a string pattern and replaces
it with another string (for now).
This commit is contained in:
@@ -12,6 +12,7 @@ Strip = normalizers.Strip
|
||||
StripAccents = normalizers.StripAccents
|
||||
Nmt = normalizers.Nmt
|
||||
Precompiled = normalizers.Precompiled
|
||||
Replace = normalizers.Replace
|
||||
|
||||
|
||||
NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
|
||||
|
||||
@@ -112,11 +112,17 @@ class Nmt(Normalizer):
|
||||
pass
|
||||
|
||||
class Precompiled(Normalizer):
|
||||
""" SpmNmtNfkc normalizer """
|
||||
""" Precompiled normalizer """
|
||||
|
||||
def __init__(self, precompiled_charsmap: bytes) -> Normalizer:
|
||||
pass
|
||||
|
||||
class Replace(Normalizer):
|
||||
""" Replace normalizer """
|
||||
|
||||
def __init__(self, pattern: str, content: str) -> Normalizer:
|
||||
pass
|
||||
|
||||
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
|
||||
"""
|
||||
Instantiate unicode normalizer from the normalizer name
|
||||
|
||||
Reference in New Issue
Block a user