mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-04 03:38:23 +00:00
pyo3 v0.18 migration (#1173)
* pyo3 v0.18 migration
* Fix formatting issues reported by black
This commit is contained in:
@@ -24,7 +24,7 @@ class JiebaPreTokenizer:
|
||||
# Just an odd example...
|
||||
splits = []
|
||||
last = 0
|
||||
for (i, char) in enumerate(str(normalized_string)):
|
||||
for i, char in enumerate(str(normalized_string)):
|
||||
if char.isnumeric() and int(char) % 2 == 1:
|
||||
splits.append(normalized_string[last:i])
|
||||
last = i
|
||||
|
||||
@@ -11,6 +11,7 @@ bpe_tokenizer.normalizer = normalizers.Lowercase()
|
||||
# Initialize a dataset
|
||||
dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1")
|
||||
|
||||
|
||||
# Build an iterator over this dataset
|
||||
def batch_iterator():
|
||||
batch_length = 1000
|
||||
|
||||
Reference in New Issue
Block a user