fix sentencepiece tokenizer name

Pierric Cistac
2020-01-10 16:03:47 -05:00
parent df67eadeca
commit 24c08b2530


@@ -6,7 +6,7 @@ import { metaspacePreTokenizer } from "../bindings/pre-tokenizers";
 import { metaspaceDecoder } from "../bindings/decoders";
 import { bpeTrainer } from "../bindings/trainers";
 
-export interface SentencePieceTokenizerOptions extends OptionsWithDefaults {
+export interface SentencePieceBPETokenizerOptions extends OptionsWithDefaults {
   dropout?: number;
   mergesFile?: string;
   vocabFile?: string;
@@ -27,7 +27,7 @@ interface OptionsWithDefaults {
   unkToken?: string;
 }
 
-export interface SentencePieceTrainOptions {
+export interface SentencePieceBPETrainOptions {
   /**
    * @default []
    */
@@ -57,14 +57,14 @@ export interface SentencePieceTrainOptions {
 /**
  * Represents the BPE algorithm, with the pretokenization used by SentencePiece
  */
-export class SentencePieceTokenizer extends BaseTokenizer {
-  private static readonly defaultOptions: SentencePieceTokenizerOptions & Required<OptionsWithDefaults> = {
+export class SentencePieceBPETokenizer extends BaseTokenizer {
+  private static readonly defaultOptions: SentencePieceBPETokenizerOptions & Required<OptionsWithDefaults> = {
     addPrefixSpace: true,
     replacement: '▁',
     unkToken: '<unk>'
   };
 
-  private readonly defaultTrainOptions: Required<SentencePieceTrainOptions> = {
+  private readonly defaultTrainOptions: Required<SentencePieceBPETrainOptions> = {
     initialAlphabet: [],
     limitAlphabet: 1000,
     minFrequency: 2,
@@ -77,7 +77,7 @@ export class SentencePieceTokenizer extends BaseTokenizer {
     super(tokenizer);
   }
 
-  static async fromOptions(options?: SentencePieceTokenizerOptions): Promise<SentencePieceTokenizer> {
+  static async fromOptions(options?: SentencePieceBPETokenizerOptions): Promise<SentencePieceBPETokenizer> {
     const mergedOptions = { ...this.defaultOptions, ...options };
 
     let model: Model;
@@ -103,7 +103,7 @@ export class SentencePieceTokenizer extends BaseTokenizer {
     const decoder = metaspaceDecoder(mergedOptions.replacement, mergedOptions.addPrefixSpace);
     tokenizer.setDecoder(decoder);
 
-    return new SentencePieceTokenizer(tokenizer);
+    return new SentencePieceBPETokenizer(tokenizer);
   }
 
   /**
@@ -112,7 +112,7 @@ export class SentencePieceTokenizer extends BaseTokenizer {
    * @param files Files to use for training
    * @param [options] Training options
    */
-  async train(files: string[], options?: SentencePieceTrainOptions): Promise<void> {
+  async train(files: string[], options?: SentencePieceBPETrainOptions): Promise<void> {
     const mergedOptions = { ...this.defaultTrainOptions, ...options };
     const trainer = bpeTrainer(mergedOptions);