mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-08 05:38:23 +00:00
Use WhitespaceSplit for BPETokenizer
This commit is contained in:
@@ -3,7 +3,7 @@ import { BaseTokenizer } from "./base.tokenizer";
|
|||||||
import { Model, bpe } from "../bindings/models";
|
import { Model, bpe } from "../bindings/models";
|
||||||
import { Tokenizer } from "../bindings/tokenizer";
|
import { Tokenizer } from "../bindings/tokenizer";
|
||||||
import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
|
import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
|
||||||
import { whitespacePreTokenizer } from "../bindings/pre-tokenizers";
|
import { whitespaceSplitPreTokenizer } from "../bindings/pre-tokenizers";
|
||||||
import { bpeDecoder } from "../bindings/decoders";
|
import { bpeDecoder } from "../bindings/decoders";
|
||||||
import { bpeTrainer } from "../bindings/trainers";
|
import { bpeTrainer } from "../bindings/trainers";
|
||||||
|
|
||||||
@@ -103,7 +103,7 @@ export class BPETokenizer extends BaseTokenizer {
|
|||||||
|
|
||||||
const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);
|
const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);
|
||||||
tokenizer.setNormalizer(normalizer);
|
tokenizer.setNormalizer(normalizer);
|
||||||
tokenizer.setPreTokenizer(whitespacePreTokenizer());
|
tokenizer.setPreTokenizer(whitespaceSplitPreTokenizer());
|
||||||
|
|
||||||
const decoder = bpeDecoder(mergedOptions.suffix);
|
const decoder = bpeDecoder(mergedOptions.suffix);
|
||||||
tokenizer.setDecoder(decoder);
|
tokenizer.setDecoder(decoder);
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ class BPETokenizer(BaseTokenizer):
|
|||||||
NFKC.new(),
|
NFKC.new(),
|
||||||
Lowercase.new()
|
Lowercase.new()
|
||||||
])
|
])
|
||||||
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
|
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit.new()
|
||||||
tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)
|
tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)
|
||||||
|
|
||||||
parameters = {
|
parameters = {
|
||||||
|
|||||||
Reference in New Issue
Block a user