Use WhitespaceSplit for BPETokenizer
@@ -3,7 +3,7 @@ import { BaseTokenizer } from "./base.tokenizer";
 import { Model, bpe } from "../bindings/models";
 import { Tokenizer } from "../bindings/tokenizer";
 import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
-import { whitespacePreTokenizer } from "../bindings/pre-tokenizers";
+import { whitespaceSplitPreTokenizer } from "../bindings/pre-tokenizers";
 import { bpeDecoder } from "../bindings/decoders";
 import { bpeTrainer } from "../bindings/trainers";
 
@@ -103,7 +103,7 @@ export class BPETokenizer extends BaseTokenizer {
 
     const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);
     tokenizer.setNormalizer(normalizer);
-    tokenizer.setPreTokenizer(whitespacePreTokenizer());
+    tokenizer.setPreTokenizer(whitespaceSplitPreTokenizer());
 
     const decoder = bpeDecoder(mergedOptions.suffix);
     tokenizer.setDecoder(decoder);
@@ -30,7 +30,7 @@ class BPETokenizer(BaseTokenizer):
             NFKC.new(),
             Lowercase.new()
         ])
-        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
+        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit.new()
         tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)
 
         parameters = {
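For context on what the swap changes: Whitespace pre-tokenizes on the pattern \w+|[^\w\s]+, so punctuation is separated into its own pre-tokens, while WhitespaceSplit splits on whitespace only and keeps punctuation attached to the neighbouring word, presumably so that decoding with the suffix-based BPEDecoder reproduces the original spacing. Below is a minimal sketch of the behavioural difference (not part of this commit), assuming a recent tokenizers Python release where pre-tokenizers are built with plain constructors rather than the .new() factories shown in the diff above.

# Sketch: compare the two pre-tokenizers on the same input.
from tokenizers.pre_tokenizers import Whitespace, WhitespaceSplit

text = "Hello, world!"

# Whitespace splits on \w+|[^\w\s]+, so punctuation becomes its own pre-token:
# [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]
print(Whitespace().pre_tokenize_str(text))

# WhitespaceSplit splits on whitespace only, keeping punctuation attached:
# [('Hello,', (0, 6)), ('world!', (7, 13))]
print(WhitespaceSplit().pre_tokenize_str(text))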