Mirror of https://github.com/mii443/tokenizers.git (last synced 2025-12-03 19:28:20 +00:00).
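Update the convert scripts to use the Replace normalizer: the Albert and XLNet converters now normalize `` and '' to " themselves, so the manual quote replacement in check() is removed.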
This commit is contained in:
@@ -11,6 +11,7 @@ from tokenizers.normalizers import (
|
|||||||
Sequence,
|
Sequence,
|
||||||
BertNormalizer,
|
BertNormalizer,
|
||||||
Precompiled,
|
Precompiled,
|
||||||
|
Replace,
|
||||||
)
|
)
|
||||||
from tokenizers.pre_tokenizers import (
|
from tokenizers.pre_tokenizers import (
|
||||||
Digits,
|
Digits,
|
||||||
@@ -162,8 +163,7 @@ class AlbertConverter(SpmConverter):
|
|||||||
]
|
]
|
||||||
|
|
||||||
def normalizer(self, proto):
|
def normalizer(self, proto):
|
||||||
normalizers = []
|
normalizers = [Replace("``", '"'), Replace("''", '"')]
|
||||||
# TODO Missing Replace quotes
|
|
||||||
if not self.original_tokenizer.keep_accents:
|
if not self.original_tokenizer.keep_accents:
|
||||||
normalizers.append(NFKD())
|
normalizers.append(NFKD())
|
||||||
normalizers.append(StripAccents())
|
normalizers.append(StripAccents())
|
||||||
@@ -299,8 +299,7 @@ class XLNetConverter(SpmConverter):
|
|||||||
]
|
]
|
||||||
|
|
||||||
def normalizer(self, proto):
|
def normalizer(self, proto):
|
||||||
# TODO Missing Replace quotes
|
normalizers = [Replace("``", '"'), Replace("''", '"')]
|
||||||
normalizers = []
|
|
||||||
if not self.original_tokenizer.keep_accents:
|
if not self.original_tokenizer.keep_accents:
|
||||||
normalizers.append(NFKD())
|
normalizers.append(NFKD())
|
||||||
normalizers.append(StripAccents())
|
normalizers.append(StripAccents())
|
||||||
@@ -384,9 +383,6 @@ def check(pretrained, filename):
|
|||||||
for i, line in enumerate(f):
|
for i, line in enumerate(f):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
|
||||||
# TODO in normalizer
|
|
||||||
line = line.replace("``", '"').replace("''", '"')
|
|
||||||
|
|
||||||
start = now()
|
start = now()
|
||||||
ids = transformer_tokenizer.encode(line)
|
ids = transformer_tokenizer.encode(line)
|
||||||
trans = now()
|
trans = now()
|
||||||
|
|||||||
Reference in New Issue
Block a user