Use WhitespaceSplit for BPETokenizer

This commit is contained in:
Anthony MOI
2020-01-17 18:33:29 -05:00
parent fc601289eb
commit 395f605fd2
2 changed files with 3 additions and 3 deletions

View File

@@ -3,7 +3,7 @@ import { BaseTokenizer } from "./base.tokenizer";
import { Model, bpe } from "../bindings/models"; import { Model, bpe } from "../bindings/models";
import { Tokenizer } from "../bindings/tokenizer"; import { Tokenizer } from "../bindings/tokenizer";
import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers"; import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
import { whitespacePreTokenizer } from "../bindings/pre-tokenizers"; import { whitespaceSplitPreTokenizer } from "../bindings/pre-tokenizers";
import { bpeDecoder } from "../bindings/decoders"; import { bpeDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers"; import { bpeTrainer } from "../bindings/trainers";
@@ -103,7 +103,7 @@ export class BPETokenizer extends BaseTokenizer {
const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]); const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);
tokenizer.setNormalizer(normalizer); tokenizer.setNormalizer(normalizer);
tokenizer.setPreTokenizer(whitespacePreTokenizer()); tokenizer.setPreTokenizer(whitespaceSplitPreTokenizer());
const decoder = bpeDecoder(mergedOptions.suffix); const decoder = bpeDecoder(mergedOptions.suffix);
tokenizer.setDecoder(decoder); tokenizer.setDecoder(decoder);

View File

@@ -30,7 +30,7 @@ class BPETokenizer(BaseTokenizer):
NFKC.new(), NFKC.new(),
Lowercase.new() Lowercase.new()
]) ])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new() tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit.new()
tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix) tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)
parameters = { parameters = {