mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-05 12:18:20 +00:00
Adding a new normalizer that strips accents by removing combining (#416)
* Adding a new normalizer that strips accents by removing combining characters in unicode strings. * Adding Node bindings + better normalizer impl. * Doc comment -> Regular comment.
This commit is contained in:
@@ -9,6 +9,7 @@ NFKC = normalizers.NFKC
|
||||
Sequence = normalizers.Sequence
|
||||
Lowercase = normalizers.Lowercase
|
||||
Strip = normalizers.Strip
|
||||
StripAccents = normalizers.StripAccents
|
||||
Nmt = normalizers.Nmt
|
||||
Precompiled = normalizers.Precompiled
|
||||
|
||||
|
||||
@@ -99,6 +99,12 @@ class Strip(Normalizer):
|
||||
def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
|
||||
pass
|
||||
|
||||
class StripAccents(Normalizer):
|
||||
""" StripAccents normalizer """
|
||||
|
||||
def __init__(self) -> Normalizer:
|
||||
pass
|
||||
|
||||
class Nmt(Normalizer):
|
||||
""" Nmt normalizer """
|
||||
|
||||
|
||||
Reference in New Issue
Block a user