big big big

This commit is contained in:
Pierric Cistac
2020-01-10 14:49:13 -05:00
parent 34875d5771
commit 80f6d58177
24 changed files with 762 additions and 345 deletions

View File

@ -4,32 +4,30 @@
  */
 declare class Decoder {}
 
-export namespace decoders {
 /**
  * Instantiate a new ByteLevel Decoder
  */
-  export function ByteLevel(): Decoder;
+export function byteLevelDecoder(): Decoder;
 
 /**
  * Instantiate a new WordPiece Decoder
  * @param {string} [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
  */
-  export function WordPiece(prefix?: string): Decoder;
+export function wordPieceDecoder(prefix?: string): Decoder;
 
 /**
  * Instantiate a new Metaspace Decoder
  *
  * @param {string} [replacement='▁'] The replacement character.
  * Must be exactly one character. By default we use the `▁` (U+2581) meta symbol (same as in SentencePiece).
  * @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
  * This lets us treat `hello` exactly like `say hello`.
  */
-  export function Metaspace(replacement?: string, addPrefixSpace?: boolean): Decoder;
+export function metaspaceDecoder(replacement?: string, addPrefixSpace?: boolean): Decoder;
 
 /**
- * Instantiate a new BPEDecoder
+ * Instantiate a new BPE Decoder
  * @param {string} [suffix='</w>'] The suffix that was used to characterize an end-of-word.
  * This suffix will be replaced by whitespaces during the decoding
  */
-export function BPEDecoder(suffix?: string): Decoder;
-}
+export function bpeDecoder(suffix?: string): Decoder;

View File

@ -0,0 +1,8 @@
var addon = require('../../native');
module.exports = {
byteLevelDecoder: addon.decoders_ByteLevel,
wordPieceDecoder: addon.decoders_WordPiece,
metaspaceDecoder: addon.decoders_Metaspace,
bpeDecoder: addon.decoders_BPEDecoder
};
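For quick reference, a minimal sketch of how the renamed decoder bindings are meant to be called (import path assumed relative to the lib root; the arguments shown are just the documented defaults):

import { byteLevelDecoder, wordPieceDecoder, metaspaceDecoder, bpeDecoder } from "./bindings/decoders";

const byteLevel = byteLevelDecoder();
const wordPiece = wordPieceDecoder("##");       // same as wordPieceDecoder()
const metaspace = metaspaceDecoder("▁", true);  // same as metaspaceDecoder()
const bpe = bpeDecoder("</w>");                 // same as bpeDecoder()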

View File

@ -1,7 +0,0 @@
export { decoders } from './decoders';
export { models } from './models';
export { normalizers } from './normalizers';
export { preTokenizers } from './pre-tokenizers';
export { postProcessors } from './post-processors';
export { Tokenizer } from './tokenizer';
export { trainers } from './trainers'

View File

@ -1,19 +0,0 @@
var addon = require('../../native');
exports.Tokenizer = addon.tokenizer_Tokenizer;
exports.models = {
BPE: {
fromFiles: addon.models_BPE_from_files,
empty: addon.models_BPE_empty,
},
WordPiece: addon.models_WordPiece,
}
exports.decoders = {
ByteLevel: addon.decoders_ByteLevel,
WordPiece: addon.decoders_WordPiece,
Metaspace: addon.decoders_Metaspace,
BPEDecoder: addon.decoders_BPEDecoder,
}
exports.post_processors = {
BertProcessing: addon.processors_BertProcessing,
}

View File

@ -6,73 +6,103 @@ declare class Model {
 }
 
-export namespace models {
-  export namespace BPE {
-    export interface BPEOptions {
+export namespace bpe {
+  export interface BPEModelOptions {
     /**
      * The number of words that the BPE cache can contain. The cache allows
      * to speed-up the process by keeping the result of the merge operations
      * for a number of words.
      */
     cacheCapacity?: number;
     /**
      * The BPE dropout to use. Must be a float between 0 and 1
      */
     dropout?: number;
     /**
      * The unknown token to be used by the model
      */
     unkToken?: string;
     /**
      * The prefix to attach to subword units that don't represent a beginning of word
      */
     continuingSubwordPrefix?: string;
     /**
      * The suffix to attach to subword units that represent an end of word
      */
     endOfWordSuffix?: string;
   }
 
   /**
    * Instantiate a BPE model from the given vocab and merges files
    *
-   * @param {string} vocab Path to a vocabulary JSON file
-   * @param {string} merges Path to a merge file
-   * @param {BPEOptions} [options] BPE model options
-   * @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
+   * @param vocab Path to a vocabulary JSON file
+   * @param merges Path to a merge file
+   * @param [options] BPE model options
    */
   export function fromFiles(
     vocab: string,
     merges: string,
-    options: BPEOptions | null,
-    __callback: (err: any, model: Model) => void
-  ): void;
+    options?: BPEModelOptions
+  ): Model;
+
+  /**
+   * Instantiate a BPE model from the given vocab and merges files
+   *
+   * @param vocab Path to a vocabulary JSON file
+   * @param merges Path to a merge file
+   * @param options BPE model options
+   * @param __callback Callback called when model is loaded
+   */
+  // export function fromFiles(
+  //   vocab: string,
+  //   merges: string,
+  //   options: BPEModelOptions | null,
+  //   __callback: (err: any, model: Model) => void
+  // ): void;
 
   /**
    * Instantiate an empty BPE Model
    */
   export function empty(): Model;
 }
 
-  export namespace WordPiece {
+export namespace wordPiece {
+  export interface WordPieceModelOptions {
+    /**
+     * The maximum number of characters to authorize in a single word.
+     * @default 100
+     */
+    maxInputCharsPerWord?: number;
+    /**
+     * The unknown token to be used by the model.
+     * @default "[UNK]"
+     */
+    unkToken?: string;
+  }
+
   /**
    * Instantiate a WordPiece model from the given vocab file
    *
    * @param {string} vocab Path to a vocabulary file
-   * @param {string} [unkToken] The unknown token to be used by the model
-   * @param {number} [maxInputCharsPerWord] The maximum number of characters to authorize in a single word
-   * @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
+   * @param [options] WordPiece model options
    */
-  export function fromFiles(
-    vocab: string,
-    unkToken: string,
-    maxInputCharsPerWord: number | null,
-    __callback: (err: any, model: Model) => void
-  ): void;
+  export function fromFiles(vocab: string, options?: WordPieceModelOptions): Model;
+
+  /**
+   * Instantiate a WordPiece model from the given vocab file
+   *
+   * @param vocab Path to a vocabulary file
+   * @param options WordPiece model options
+   * @param __callback Callback called when model is loaded
+   */
+  // export function fromFiles(
+  //   vocab: string,
+  //   options: WordPieceModelOptions | null,
+  //   __callback: (err: any, model: Model) => void
+  // ): void;
 
   /**
    * Instantiate an empty WordPiece model
    */
   export function empty(): Model;
 }
-}

View File

@ -0,0 +1,12 @@
var addon = require('../../native');
module.exports = {
bpe: {
fromFiles: addon.models_BPE_from_files,
empty: addon.models_BPE_empty,
},
wordPiece: {
fromFiles: addon.models_WordPiece_from_files,
empty: addon.models_WordPiece_empty,
}
}
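A sketch of the new model factories as declared in models.d.ts above: fromFiles is now synchronous and returns the Model directly (the callback-based overloads are commented out). The file paths below are placeholders; the import path is assumed relative to the lib root.

import { bpe, wordPiece } from "./bindings/models";

const bpeModel = bpe.fromFiles("./vocab.json", "./merges.txt", { unkToken: "<unk>", dropout: 0.1 });
const wordPieceModel = wordPiece.fromFiles("./vocab.txt", { unkToken: "[UNK]", maxInputCharsPerWord: 100 });
const emptyModel = bpe.empty();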

View File

@ -4,68 +4,66 @@
  */
 declare class Normalizer {}
 
-export namespace normalizers {
-  interface BertNormalizerOptions {
+interface BertNormalizerOptions {
   /**
    * Whether to clean the text, by removing any control characters
    * and replacing all whitespaces by the classic one.
    * @default true
    */
   cleanText?: boolean;
   /**
    * Whether to handle chinese chars by putting spaces around them.
    * @default true
    */
   handleChineseChars?: boolean;
   /**
    * Whether to lowercase.
    * @default true
    */
   lowercase?: boolean;
   /**
    * Whether to strip all accents.
    * @default true
    */
   stripAccents?: boolean;
 }
 
 /**
  * Instantiate a Bert Normalizer with the given options
  *
  * @param [options] Normalizer options
  * @returns {Normalizer} Bert Normalizer. Takes care of normalizing raw text before giving it to a Bert model.
  * This includes cleaning the text, handling accents, chinese chars and lowercasing
  */
 export function bertNormalizer(options?: BertNormalizerOptions): Normalizer;
 
 /**
  * Returns a new NFD Unicode Normalizer
  */
-  export function nfd(): Normalizer;
+export function nfdNormalizer(): Normalizer;
 
 /**
  * Returns a new NFKD Unicode Normalizer
  */
-  export function nfkd(): Normalizer;
+export function nfkdNormalizer(): Normalizer;
 
 /**
  * Returns a new NFC Unicode Normalizer
  */
-  export function nfc(): Normalizer;
+export function nfcNormalizer(): Normalizer;
 
 /**
  * Returns a new NFKC Unicode Normalizer
  */
-  export function nfkc(): Normalizer;
+export function nfkcNormalizer(): Normalizer;
 
 /**
  * Instantiate a new Normalization Sequence using the given normalizers
  * @param normalizers A list of Normalizer to be run as a sequence
  */
-  export function sequence(normalizers: Normalizer[]): Normalizer;
+export function sequenceNormalizer(normalizers: Normalizer[]): Normalizer;
 
 /**
  * Returns a new Lowercase Normalizer
  */
-  export function lowercase(): Normalizer;
-}
+export function lowercaseNormalizer(): Normalizer;

View File

@ -0,0 +1,11 @@
var addon = require('../../native');
module.exports = {
bertNormalizer: addon.normalizers_BertNormalizer,
nfdNormalizer: addon.normalizers_NFD,
nfkdNormalizer: addon.normalizers_NFKD,
nfcNormalizer: addon.normalizers_NFC,
nfkcNormalizer: addon.normalizers_NFKC,
sequenceNormalizer: addon.normalizers_Sequence,
lowercaseNormalizer: addon.normalizers_Lowercase
};
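The *Normalizer suffix on these exports keeps them distinct from the pre-tokenizer and decoder helpers of the same family; sequenceNormalizer chains several of them, which is exactly how BPETokenizer below builds its normalization pipeline. A minimal sketch (import path assumed relative to the lib root):

import { bertNormalizer, sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "./bindings/normalizers";

const forBert = bertNormalizer({ lowercase: true, stripAccents: true });
const chained = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);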

View File

@ -4,12 +4,10 @@
  */
 declare class PostProcessor {}
 
-export namespace postProcessors {
 /**
  * Instantiate a new BertProcessing with the given tokens
  *
  * @param {[string, number]} sep A tuple with the string representation of the SEP token, and its id
  * @param {[string, number]} cls A tuple with the string representation of the CLS token, and its id
  */
-  export function BertProcessing(sep: [string, number], cls: [string, number]): PostProcessor;
-}
+export function bertProcessing(sep: [string, number], cls: [string, number]): PostProcessor;

View File

@ -0,0 +1,5 @@
var addon = require('../../native');
module.exports = {
bertProcessing: addon.processors_BertProcessing
};
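bertProcessing takes (token, id) tuples; in BertWordPieceTokenizer.fromOptions below the ids are looked up with tokenizer.tokenToId, so the numbers here are only placeholders:

import { bertProcessing } from "./bindings/post-processors";

const processor = bertProcessing(["[SEP]", 102], ["[CLS]", 101]); // placeholder ids
// tokenizer.setPostProcessor(processor);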

View File

@ -4,50 +4,46 @@
  */
 declare class PreTokenizer {}
 
-export namespace preTokenizers {
-  export namespace byteLevel {
 /**
  * Instantiate a new ByteLevel PreTokenizer
  *
  * @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
  * This lets us treat `hello` exactly like `say hello`.
  * @returns {PreTokenizer} ByteLevel PreTokenizer.
  * This pre-tokenizer takes care of replacing all bytes of the given string
  * with a corresponding representation, as well as splitting into words.
  */
-    export function byteLevel(addPrefixSpace?: boolean): PreTokenizer;
+export function byteLevelPreTokenizer(addPrefixSpace?: boolean): PreTokenizer;
 
 /**
  * Returns the alphabet used by the ByteLevel PreTokenizer.
  * Since the ByteLevel works as its name suggests, at the byte level, it
  * encodes any byte to one visible character. This means that there is a
  * total of 256 different characters composing this alphabet.
  */
 export function byteLevelAlphabet(): string[];
-  }
 
 /**
  * Returns a Whitespace PreTokenizer
  * This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
  */
-  export function whitespace(): PreTokenizer;
+export function whitespacePreTokenizer(): PreTokenizer;
 
 /**
  * Returns a new Bert PreTokenizer.
  * This pre-tokenizer splits tokens on spaces, and also on punctuation.
  * Each occurrence of a punctuation character will be treated separately.
  */
 export function bertPreTokenizer(): PreTokenizer;
 
 /**
  * Returns a new Metaspace Tokenizer.
  * This pre-tokenizer replaces any whitespace by the provided replacement character.
  * It then tries to split on these spaces.
  *
  * @param {string} [replacement="▁"] The replacement character. Must be exactly one character.
  * By default we use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
  * @param {boolean} [addPrefixSpace] Whether to add a space to the first word if there isn't already one.
  * This lets us treat `hello` exactly like `say hello`.
  */
-  export function metaspace(replacement?: string, addPrefixSpace?: boolean): PreTokenizer;
-}
+export function metaspacePreTokenizer(replacement?: string, addPrefixSpace?: boolean): PreTokenizer;

View File

@ -0,0 +1,9 @@
var addon = require('../../native');
module.exports = {
byteLevelPreTokenizer: addon.pre_tokenizers_ByteLevel,
byteLevelAlphabet: addon.pre_tokenizers_ByteLevel_Alphabet,
whitespacePreTokenizer: addon.pre_tokenizers_Whitespace,
bertPreTokenizer: addon.pre_tokenizers_BertPreTokenizer,
metaspacePreTokenizer: addon.pre_tokenizers_Metaspace
};
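byteLevelAlphabet() is the piece that pairs with the ByteLevel pre-tokenizer: it returns the 256-character alphabet, which can be passed to a trainer as initialAlphabet so that every byte-level symbol ends up in the vocabulary (the ByteLevelBPETokenizer added later in this commit does exactly that). A sketch, with an assumed relative import path:

import { byteLevelPreTokenizer, byteLevelAlphabet } from "./bindings/pre-tokenizers";

const preTokenizer = byteLevelPreTokenizer(true); // addPrefixSpace
const alphabet = byteLevelAlphabet();             // 256 visible characters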

View File

@ -0,0 +1,3 @@
var addon = require('../../native');
module.exports.Tokenizer = addon.tokenizer_Tokenizer;
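How the bindings connect, mirroring the wiring used by the implementations below (the import paths and the particular components chosen here are illustrative): the Model goes into the Tokenizer constructor, everything else is attached through setters.

import { Tokenizer } from "./bindings/tokenizer";
import { bpe } from "./bindings/models";
import { nfkcNormalizer } from "./bindings/normalizers";
import { whitespacePreTokenizer } from "./bindings/pre-tokenizers";
import { bpeDecoder } from "./bindings/decoders";

const tokenizer = new Tokenizer(bpe.empty());
tokenizer.setNormalizer(nfkcNormalizer());
tokenizer.setPreTokenizer(whitespacePreTokenizer());
tokenizer.setDecoder(bpeDecoder("</w>"));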

View File

@ -4,59 +4,57 @@
  */
 declare class Trainer {}
 
-export namespace trainers {
-  interface TrainerOptions {
+interface TrainerOptions {
   /**
    * A prefix to be used for every subword that is not a beginning-of-word.
    */
   continuingSubwordPrefix?: string;
   /**
    * A suffix to be used for every subword that is an end-of-word.
    */
   endOfWordSuffix?: string;
   /**
    * A list of characters to include in the initial alphabet, even
    * if not seen in the training dataset.
    * If the strings contain more than one character, only the first one
    * is kept.
    * @default []
    */
   initialAlphabet?: string[];
   /**
    * The maximum different characters to keep in the alphabet.
    */
   limitAlphabet?: number;
   /**
    * The minimum frequency a pair should have in order to be merged.
    * @default 2
    */
   minFrequency?: number;
   /**
    * Whether to show progress bars while training.
    * @default true
    */
   showProgress?: boolean;
   /**
    * A list of special tokens the model should know of.
    * @default []
    */
   specialTokens?: string[];
   /**
    * The size of the final vocabulary, including all tokens and alphabet.
    * @default 30000
    */
   vocabSize?: number;
 }
 
 /**
  * Instantiate a new BPE Trainer
  * @param {TrainerOptions} [options] BPE Trainer options
  */
 export function bpeTrainer(options?: TrainerOptions): Trainer;
 
 /**
  * Instantiate a new WordPiece Trainer
  * @param {TrainerOptions} [options] WordPiece Trainer options
  */
 export function wordPieceTrainer(options?: TrainerOptions): Trainer;
-}

View File

@ -0,0 +1,6 @@
var addon = require('../../native');
module.exports = {
bpeTrainer: addon.trainers_BPETrainer,
wordPieceTrainer: addon.trainers_WordPieceTrainer
};
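A trainer built from TrainerOptions is handed to Tokenizer.train together with the training files, which is what the train() methods of the implementations below do. Sketch with placeholder file names and an assumed relative import path:

import { wordPieceTrainer } from "./bindings/trainers";

const trainer = wordPieceTrainer({
  vocabSize: 30000,
  minFrequency: 2,
  specialTokens: ["[UNK]", "[SEP]", "[CLS]"]
});
// tokenizer.train(trainer, ["./corpus-part-1.txt", "./corpus-part-2.txt"]);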

View File

@ -2,7 +2,7 @@ import { promisify } from "util";
 import { Encoding, Tokenizer } from "../bindings/tokenizer";
 
 export class BaseTokenizer {
-  constructor(private tokenizer: Tokenizer) {}
+  constructor(protected tokenizer: Tokenizer) {}
 
   /**
    * Encode the given sequence
@ -11,7 +11,7 @@ export class BaseTokenizer {
   * @param {(string | null)} pair The optional pair sequence
   */
  async encode(sequence: string, pair?: string): Promise<Encoding> {
-    const encode = promisify(this.tokenizer.encode);
+    const encode = promisify(this.tokenizer.encode.bind(this.tokenizer));
     return encode(sequence, pair ?? null);
   }
@ -22,7 +22,7 @@ export class BaseTokenizer {
   * The list can contain both at the same time.
   */
  async encodeBatch(sequences: (string | [string, string])[]): Promise<Encoding[]> {
-    const encodeBatch = promisify(this.tokenizer.encodeBatch);
+    const encodeBatch = promisify(this.tokenizer.encodeBatch.bind(this.tokenizer));
     return encodeBatch(sequences);
   }
 }
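The .bind(this.tokenizer) added here matters because promisify copies the function: calling the promisified method detaches it from its receiver, so any use of `this` inside the native method would see undefined. A generic illustration of the pattern (not the tokenizer API):

import { promisify } from "util";

class Counter {
  private n = 0;
  increment(cb: (err: Error | null, value: number) => void): void {
    cb(null, ++this.n); // relies on `this` being the Counter instance
  }
}

const counter = new Counter();

const unbound = promisify(counter.increment);
unbound().catch(err => console.log(err instanceof TypeError)); // true: `this` was lost

const bound = promisify(counter.increment.bind(counter));
bound().then(value => console.log(value)); // 1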

View File

@ -1,9 +1,14 @@
 import { promisify } from "util";
 import { BaseTokenizer } from "./base.tokenizer";
 import { Tokenizer } from "../bindings/tokenizer";
-import { Model, models } from "../bindings/models";
+import { Model, wordPiece } from "../bindings/models";
+import { bertNormalizer } from "../bindings/normalizers";
+import { bertPreTokenizer } from "../bindings/pre-tokenizers";
+import { bertProcessing } from "../bindings/post-processors";
+import { wordPieceDecoder } from "../bindings/decoders";
+import { wordPieceTrainer } from "../bindings/trainers";
 
-interface BertWordpieceOptions {
+export interface BertWordPieceOptions {
   /**
    * @default true
    */
@ -43,42 +48,121 @@ interface BertWordpieceOptions {
   wordpiecesPrefix?: string;
 }
 
-const defaultBertOptions: Required<Omit<BertWordpieceOptions, 'vocabFile'>> & { vocabFile?: string } = {
-  addSpecialTokens: true,
-  cleanText: true,
-  clsToken: '[CLS]',
-  handleChineseChars: true,
-  lowercase: true,
-  sepToken: '[SEP]',
-  stripAccents: true,
-  unkToken: '[UNK]',
-  wordpiecesPrefix: '##'
-};
-
-/**
- * Instantiate and returns a new Bert WordPiece tokenizer
- * @param options
- */
-export async function getBertWordpieceTokenizer(options?: BertWordpieceOptions): Promise<BertWordpieceTokenizer> {
-  const mergedOptions = { ...defaultBertOptions, ...options };
-
-  let model: Model;
-  if (mergedOptions.vocabFile) {
-    const fromFiles = promisify(models.WordPiece.fromFiles);
-    model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
-  } else {
-    model = models.WordPiece.empty();
-  }
-
-  const tokenizer = new Tokenizer(model);
-  return new BertWordpieceTokenizer(tokenizer);
-}
+export interface BertWordPieceTrainOptions {
+  /**
+   * @default []
+   */
+  initialAlphabet?: string[];
+  /**
+   * @default 1000
+   */
+  limitAlphabet?: number;
+  /**
+   * @default 2
+   */
+  minFrequency?: number;
+  /**
+   * @default true
+   */
+  showProgress?: boolean;
+  /**
+   * @default ["[UNK]", "[SEP]", "[CLS]"]
+   */
+  specialTokens?: string[];
+  /**
+   * @default 30000
+   */
+  vocabSize?: number;
+  /**
+   * @default "##"
+   */
+  wordpiecesPrefix?: string;
+}
 
 /**
  * Bert WordPiece Tokenizer
  */
-class BertWordpieceTokenizer extends BaseTokenizer {
-  constructor(tokenizer: Tokenizer) {
+export class BertWordPieceTokenizer extends BaseTokenizer {
+  private static readonly defaultBertOptions:
+    Required<Omit<BertWordPieceOptions, "vocabFile">> & { vocabFile?: string } = {
+    addSpecialTokens: true,
+    cleanText: true,
+    clsToken: "[CLS]",
+    handleChineseChars: true,
+    lowercase: true,
+    sepToken: "[SEP]",
+    stripAccents: true,
+    unkToken: "[UNK]",
+    wordpiecesPrefix: "##"
+  };
+
+  private readonly defaultTrainOptions: Required<BertWordPieceTrainOptions> = {
+    initialAlphabet: [],
+    limitAlphabet: 1000,
+    minFrequency: 2,
+    showProgress: true,
+    specialTokens: ["<unk>"],
+    vocabSize: 30000,
+    wordpiecesPrefix: "##"
+  };
+
+  private constructor(tokenizer: Tokenizer) {
     super(tokenizer);
   }
+
+  /**
+   * Instantiate and returns a new Bert WordPiece tokenizer
+   * @param [options] Optional tokenizer options
+   */
+  static async fromOptions(options?: BertWordPieceOptions): Promise<BertWordPieceTokenizer> {
+    const mergedOptions = { ...this.defaultBertOptions, ...options };
+
+    let model: Model;
+    if (mergedOptions.vocabFile) {
+      // const fromFiles = promisify(WordPiece.fromFiles);
+      model = wordPiece.fromFiles(mergedOptions.vocabFile, { unkToken: mergedOptions.unkToken });
+      // model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
+    } else {
+      model = wordPiece.empty();
+    }
+
+    const tokenizer = new Tokenizer(model);
+
+    const normalizer = bertNormalizer(mergedOptions);
+    tokenizer.setNormalizer(normalizer);
+    tokenizer.setPreTokenizer(bertPreTokenizer());
+
+    const sepTokenId = tokenizer.tokenToId(mergedOptions.sepToken);
+    if (sepTokenId === undefined) {
+      throw new Error("sepToken not found in the vocabulary");
+    }
+    const clsTokenId = tokenizer.tokenToId(mergedOptions.clsToken);
+    if (clsTokenId === undefined) {
+      throw new Error("clsToken not found in the vocabulary");
+    }
+
+    if (mergedOptions.addSpecialTokens) {
+      const processor = bertProcessing([mergedOptions.sepToken, sepTokenId], [mergedOptions.clsToken, clsTokenId]);
+      tokenizer.setPostProcessor(processor);
+    }
+
+    const decoder = wordPieceDecoder(mergedOptions.wordpiecesPrefix);
+    tokenizer.setDecoder(decoder);
+
+    return new BertWordPieceTokenizer(tokenizer);
+  }
+
+  /**
+   * Train the model using the given files
+   *
+   * @param files Files to use for training
+   * @param [options] Training options
+   */
+  async train(files: string[], options?: BertWordPieceTrainOptions): Promise<void> {
+    const mergedOptions = { ...this.defaultTrainOptions, ...options };
+    const trainer = wordPieceTrainer(mergedOptions);
+
+    this.tokenizer.train(trainer, files);
+  }
 }
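Usage sketch for the new static factory that replaces getBertWordpieceTokenizer (the vocab path is a placeholder, and the vocabulary must contain the [SEP] and [CLS] tokens or fromOptions throws):

import { BertWordPieceTokenizer } from "./implementations";

async function example() {
  const tokenizer = await BertWordPieceTokenizer.fromOptions({ vocabFile: "./vocab.txt" });
  return tokenizer.encode("Hello there!", "How are you?");
}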

View File

@ -1,52 +1,126 @@
 import { promisify } from "util";
 import { BaseTokenizer } from "./base.tokenizer";
-import { Model, models } from "../bindings/models";
+import { Model, bpe } from "../bindings/models";
 import { Tokenizer } from "../bindings/tokenizer";
+import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
+import { whitespacePreTokenizer } from "../bindings/pre-tokenizers";
+import { bpeDecoder } from "../bindings/decoders";
+import { bpeTrainer } from "../bindings/trainers";
 
-interface BPEOptions {
+export interface BPETokenizerOptions {
   dropout?: number;
   mergesFile?: string;
+  /**
+   * @default "</w>"
+   */
   suffix?: string;
+  /**
+   * @default "<unk>"
+   */
   unkToken?: string;
   vocabFile?: string;
 }
 
-const defaultBPEOptions: BPEOptions & Required<Pick<BPEOptions, 'unkToken' | 'suffix'>> = {
-  suffix: '</w>',
-  unkToken: '<unk>'
-};
-
-/**
- * Instantiate and returns a new BPE tokenizer
- * @param options
- */
-export async function getBPETokenizer(options?: BPEOptions): Promise<BPETokenizer> {
-  const mergedOptions = { ...defaultBPEOptions, ...options };
-
-  let model: Model;
-  if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
-    const fromFiles = promisify(models.BPE.fromFiles);
-    const modelOptions: models.BPE.BPEOptions = {
-      dropout: mergedOptions.dropout,
-      endOfWordSuffix: mergedOptions.suffix,
-      unkToken: mergedOptions.unkToken
-    };
-
-    model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
-  } else {
-    model = models.BPE.empty();
-  }
-
-  const tokenizer = new Tokenizer(model);
-  return new BPETokenizer(tokenizer);
-}
+export interface BPETokenizerTrainOptions {
+  /**
+   * @default []
+   */
+  initialAlphabet?: string[];
+  /**
+   * @default 1000
+   */
+  limitAlphabet?: number;
+  /**
+   * @default 2
+   */
+  minFrequency?: number;
+  /**
+   * @default true
+   */
+  showProgress?: boolean;
+  /**
+   * @default ["<unk>"]
+   */
+  specialTokens?: string[];
+  /**
+   * @default "</w>"
+   */
+  suffix?: string;
+  /**
+   * @default 30000
+   */
+  vocabSize?: number;
+}
 
 /**
  * Original BPE Tokenizer.
  * Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
  */
-class BPETokenizer extends BaseTokenizer {
-  constructor(tokenizer: Tokenizer) {
+export class BPETokenizer extends BaseTokenizer {
+  private static readonly defaultBPEOptions:
+    BPETokenizerOptions & Required<Pick<BPETokenizerOptions, "unkToken" | "suffix">> = {
+    suffix: "</w>",
+    unkToken: "<unk>"
+  };
+
+  private readonly defaultTrainOptions: Required<BPETokenizerTrainOptions> = {
+    initialAlphabet: [],
+    limitAlphabet: 1000,
+    minFrequency: 2,
+    showProgress: true,
+    specialTokens: ["<unk>"],
+    suffix: "</w>",
+    vocabSize: 30000
+  };
+
+  private constructor(tokenizer: Tokenizer) {
     super(tokenizer);
   }
+
+  /**
+   * Instantiate and returns a new BPE tokenizer
+   * @param [options] Optional tokenizer options
+   */
+  static async fromOptions(options?: BPETokenizerOptions): Promise<BPETokenizer> {
+    const mergedOptions = { ...this.defaultBPEOptions, ...options };
+
+    let model: Model;
+    if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
+      // const fromFiles = promisify(BPE.fromFiles);
+      const modelOptions: bpe.BPEModelOptions = {
+        dropout: mergedOptions.dropout,
+        endOfWordSuffix: mergedOptions.suffix,
+        unkToken: mergedOptions.unkToken
+      };
+
+      model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
+      // model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
+    } else {
+      model = bpe.empty();
+    }
+
+    const tokenizer = new Tokenizer(model);
+
+    const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);
+    tokenizer.setNormalizer(normalizer);
+    tokenizer.setPreTokenizer(whitespacePreTokenizer());
+
+    const decoder = bpeDecoder(mergedOptions.suffix);
+    tokenizer.setDecoder(decoder);
+
+    return new BPETokenizer(tokenizer);
+  }
+
+  /**
+   * Train the model using the given files
+   *
+   * @param files Files to use for training
+   * @param [options] Training options
+   */
+  async train(files: string[], options?: BPETokenizerTrainOptions): Promise<void> {
+    const mergedOptions = { ...this.defaultTrainOptions, ...options };
+    const trainer = bpeTrainer(mergedOptions);
+
+    this.tokenizer.train(trainer, files);
+  }
 }
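Training flow sketch for the reworked BPETokenizer: without vocabFile/mergesFile it starts from bpe.empty(), learns merges from the given files, then encodes. The file names are placeholders.

import { BPETokenizer } from "./implementations";

async function trainExample() {
  const tokenizer = await BPETokenizer.fromOptions();
  await tokenizer.train(["./corpus-a.txt", "./corpus-b.txt"], { vocabSize: 5000 });
  return tokenizer.encode("hello tokenizers");
}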

View File

@ -0,0 +1,91 @@
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, bpe } from "../bindings/models";
import { nfkcNormalizer } from "../bindings/normalizers";
import { byteLevelPreTokenizer, byteLevelAlphabet } from "../bindings/pre-tokenizers";
import { byteLevelDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";
export interface ByteLevelBPETokenizerOptions {
/**
* @default false
*/
addPrefixSpace?: boolean;
mergesFile?: string;
vocabFile?: string;
}
export interface ByteLevelBPETrainOptions {
/**
* @default 2
*/
minFrequency?: number;
/**
* @default true
*/
showProgress?: boolean;
/**
* @default []
*/
specialTokens?: string[];
/**
* @default 30000
*/
vocabSize?: number;
}
/**
* Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
*/
export class ByteLevelBPETokenizer extends BaseTokenizer {
private static readonly defaultOptions:
ByteLevelBPETokenizerOptions & Required<Pick<ByteLevelBPETokenizerOptions, 'addPrefixSpace'>> = {
addPrefixSpace: false
};
private readonly defaultTrainOptions: Required<ByteLevelBPETrainOptions> = {
minFrequency: 2,
showProgress: true,
specialTokens: ['<unk>'],
vocabSize: 30000
};
private constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
static async fromOptions(options?: ByteLevelBPETokenizerOptions): Promise<ByteLevelBPETokenizer> {
const mergedOptions = { ...this.defaultOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
// const fromFiles = promisify(BPE.fromFiles);
model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile);
// model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, null);
} else {
model = bpe.empty();
}
const tokenizer = new Tokenizer(model);
tokenizer.setNormalizer(nfkcNormalizer());
const preTokenizer = byteLevelPreTokenizer(mergedOptions.addPrefixSpace);
tokenizer.setPreTokenizer(preTokenizer);
tokenizer.setDecoder(byteLevelDecoder());
return new ByteLevelBPETokenizer(tokenizer);
}
/**
* Train the model using the given files
*
* @param files Files to use for training
* @param [options] Training options
*/
async train(files: string[], options?: ByteLevelBPETrainOptions): Promise<void> {
const mergedOptions = { ...this.defaultTrainOptions, ...options };
const trainer = bpeTrainer({ ...mergedOptions, initialAlphabet: byteLevelAlphabet() });
this.tokenizer.train(trainer, files);
}
}

View File

@ -1,2 +1,4 @@
 export * from './bert-wordpiece.tokenizer';
 export * from './bpe.tokenizer';
+export * from './byte-level-bpe.tokenizer';
+export * from './sentence-piece.tokenizer';

View File

@ -0,0 +1,121 @@
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, bpe } from "../bindings/models";
import { nfkcNormalizer } from "../bindings/normalizers";
import { metaspacePreTokenizer } from "../bindings/pre-tokenizers";
import { metaspaceDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";
export interface SentencePieceTokenizerOptions extends OptionsWithDefaults {
dropout?: number;
mergesFile?: string;
vocabFile?: string;
}
interface OptionsWithDefaults {
/**
* @default true
*/
addPrefixSpace?: boolean;
/**
* @default "▁"
*/
replacement?: string;
/**
* @default "<unk>"
*/
unkToken?: string;
}
export interface SentencePieceTrainOptions {
/**
* @default []
*/
initialAlphabet?: string[];
/**
* @default 1000
*/
limitAlphabet?: number;
/**
* @default 2
*/
minFrequency?: number;
/**
* @default true
*/
showProgress?: boolean;
/**
* @default ["<unk>"]
*/
specialTokens?: string[];
/**
* @default 30000
*/
vocabSize?: number;
}
/**
* Represents the BPE algorithm, with the pretokenization used by SentencePiece
*/
export class SentencePieceTokenizer extends BaseTokenizer {
private static readonly defaultOptions: SentencePieceTokenizerOptions & Required<OptionsWithDefaults> = {
addPrefixSpace: true,
replacement: '▁',
unkToken: '<unk>'
};
private readonly defaultTrainOptions: Required<SentencePieceTrainOptions> = {
initialAlphabet: [],
limitAlphabet: 1000,
minFrequency: 2,
showProgress: true,
specialTokens: ['<unk>'],
vocabSize: 30000
};
private constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
static async fromOptions(options?: SentencePieceTokenizerOptions): Promise<SentencePieceTokenizer> {
const mergedOptions = { ...this.defaultOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
// const fromFiles = promisify(BPE.fromFiles);
const modelOptions: bpe.BPEModelOptions = {
dropout: mergedOptions.dropout,
unkToken: mergedOptions.unkToken
};
model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
// model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, null);
} else {
model = bpe.empty();
}
const tokenizer = new Tokenizer(model);
tokenizer.setNormalizer(nfkcNormalizer());
const preTokenizer = metaspacePreTokenizer(mergedOptions.replacement, mergedOptions.addPrefixSpace);
tokenizer.setPreTokenizer(preTokenizer);
const decoder = metaspaceDecoder(mergedOptions.replacement, mergedOptions.addPrefixSpace);
tokenizer.setDecoder(decoder);
return new SentencePieceTokenizer(tokenizer);
}
/**
* Train the model using the given files
*
* @param files Files to use for training
* @param [options] Training options
*/
async train(files: string[], options?: SentencePieceTrainOptions): Promise<void> {
const mergedOptions = { ...this.defaultTrainOptions, ...options };
const trainer = bpeTrainer(mergedOptions);
this.tokenizer.train(trainer, files);
}
}

View File

@ -1,2 +1 @@
-export * from './bindings';
 export * from './implementations';

View File

@ -3,5 +3,5 @@ function __export(m) {
 for (var p in m) if (!exports.hasOwnProperty(p)) exports[p] = m[p];
 }
 Object.defineProperty(exports, "__esModule", { value: true });
-__export(require("./bindings"));
+// export * from './bindings';
 __export(require("./implementations"));

View File

@ -1,2 +1,2 @@
-export * from './bindings';
+// export * from './bindings';
 export * from './implementations';