big big big

Pierric Cistac
2020-01-10 14:49:13 -05:00
parent 34875d5771
commit 80f6d58177
24 changed files with 762 additions and 345 deletions

View File

@@ -4,32 +4,30 @@
*/
declare class Decoder {}
export namespace decoders {
/**
* Instantiate a new ByteLevel Decoder
*/
export function ByteLevel(): Decoder;
/**
* Instantiate a new ByteLevel Decoder
*/
export function byteLevelDecoder(): Decoder;
/**
* Instantiate a new WordPiece Decoder
* @param {string} [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
*/
export function WordPiece(prefix?: string): Decoder;
/**
* Instantiate a new WordPiece Decoder
* @param {string} [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
*/
export function wordPieceDecoder(prefix?: string): Decoder;
/**
* Instantiate a new Metaspace
*
* @param {string} [replacement='▁'] The replacement character.
* Must be exactly one character. By default we use the `▁` (U+2581) meta symbol (same as in SentencePiece).
* @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
* This lets us treat `hello` exactly like `say hello`.
*/
export function Metaspace(replacement?: string, addPrefixSpace?: boolean): Decoder;
/**
* Instantiate a new Metaspace
*
* @param {string} [replacement='▁'] The replacement character.
* Must be exactly one character. By default we use the `▁` (U+2581) meta symbol (same as in SentencePiece).
* @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
* This lets us treat `hello` exactly like `say hello`.
*/
export function metaspaceDecoder(replacement?: string, addPrefixSpace?: boolean): Decoder;
/**
* Instantiate a new BPEDecoder
* @param {string} [suffix='</w>'] The suffix that was used to characterize an end-of-word.
* This suffix will be replaced by whitespace during decoding
*/
export function BPEDecoder(suffix?: string): Decoder;
}
/**
* Instantiate a new BPE Decoder
* @param {string} [suffix='</w>'] The suffix that was used to characterize an end-of-word.
* This suffix will be replaced by whitespace during decoding
*/
export function bpeDecoder(suffix?: string): Decoder;
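A minimal usage sketch of the renamed decoder factories declared above; the relative import path and argument values are illustrative only:

import { byteLevelDecoder, wordPieceDecoder, metaspaceDecoder, bpeDecoder } from "../bindings/decoders";

// Each factory returns an opaque Decoder that is attached to a Tokenizer
// via tokenizer.setDecoder(...).
const byteLevel = byteLevelDecoder();
const wordPiece = wordPieceDecoder("##");       // prefix marking non-initial subwords
const metaspace = metaspaceDecoder("▁", true);  // SentencePiece-style replacement character
const bpe = bpeDecoder("</w>");                 // end-of-word suffix to strip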

View File

@@ -0,0 +1,8 @@
var addon = require('../../native');
module.exports = {
byteLevelDecoder: addon.decoders_ByteLevel,
wordPieceDecoder: addon.decoders_WordPiece,
metaspaceDecoder: addon.decoders_Metaspace,
bpeDecoder: addon.decoders_BPEDecoder
};

View File

@@ -1,7 +0,0 @@
export { decoders } from './decoders';
export { models } from './models';
export { normalizers } from './normalizers';
export { preTokenizers } from './pre-tokenizers';
export { postProcessors } from './post-processors';
export { Tokenizer } from './tokenizer';
export { trainers } from './trainers'

View File

@@ -1,19 +0,0 @@
var addon = require('../../native');
exports.Tokenizer = addon.tokenizer_Tokenizer;
exports.models = {
BPE: {
fromFiles: addon.models_BPE_from_files,
empty: addon.models_BPE_empty,
},
WordPiece: addon.models_WordPiece,
}
exports.decoders = {
ByteLevel: addon.decoders_ByteLevel,
WordPiece: addon.decoders_WordPiece,
Metaspace: addon.decoders_Metaspace,
BPEDecoder: addon.decoders_BPEDecoder,
}
exports.post_processors = {
BertProcessing: addon.processors_BertProcessing,
}

View File

@@ -6,73 +6,103 @@ declare class Model {
}
export namespace models {
export namespace BPE {
export interface BPEOptions {
/**
* The number of words that the BPE cache can contain. The cache speeds
* up the process by keeping the result of the merge operations
* for a number of words.
*/
cacheCapacity?: number;
/**
* The BPE dropout to use. Must be a float between 0 and 1
*/
dropout?: number;
/**
* The unknown token to be used by the model
*/
unkToken?: string;
/**
* The prefix to attach to subword units that don't represent a beginning of word
*/
continuingSubwordPrefix?: string;
/**
* The suffix to attach to subword units that represent an end of word
*/
endOfWordSuffix?: string;
}
export namespace bpe {
export interface BPEModelOptions {
/**
* Instantiate a BPE model from the given vocab and merges files
*
* @param {string} vocab Path to a vocabulary JSON file
* @param {string} merges Path to a merge file
* @param {BPEOptions} [options] BPE model options
* @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
* The number of words that the BPE cache can contain. The cache speeds
* up the process by keeping the result of the merge operations
* for a number of words.
*/
export function fromFiles(
vocab: string,
merges: string,
options: BPEOptions | null,
__callback: (err: any, model: Model) => void
): void;
cacheCapacity?: number;
/**
* Instantiate an empty BPE Model
* The BPE dropout to use. Must be a float between 0 and 1
*/
export function empty(): Model;
dropout?: number;
/**
* The unknown token to be used by the model
*/
unkToken?: string;
/**
* The prefix to attach to subword units that don't represent a beginning of word
*/
continuingSubwordPrefix?: string;
/**
* The suffix to attach to subword units that represent an end of word
*/
endOfWordSuffix?: string;
}
export namespace WordPiece {
/**
* Instantiate a WordPiece model from the given vocab file
*
* @param {string} vocab Path to a vocabulary file
* @param {string} [unkToken] The unknown token to be used by the model
* @param {number} [maxInputCharsPerWord] The maximum number of characters to allow in a single word
* @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
*/
export function fromFiles(
vocab: string,
unkToken: string,
maxInputCharsPerWord: number | null,
__callback: (err: any, model: Model) => void
): void;
/**
* Instantiate a BPE model from the given vocab and merges files
*
* @param vocab Path to a vocabulary JSON file
* @param merges Path to a merge file
* @param [options] BPE model options
*/
export function fromFiles(
vocab: string,
merges: string,
options?: BPEModelOptions
): Model;
/**
* Instantiate an empty WordPiece model
*/
export function empty(): Model;
}
/**
* Instantiate a BPE model from the given vocab and merges files
*
* @param vocab Path to a vocabulary JSON file
* @param merges Path to a merge file
* @param options BPE model options
* @param __callback Callback called when model is loaded
*/
// export function fromFiles(
// vocab: string,
// merges: string,
// options: BPEModelOptions | null,
// __callback: (err: any, model: Model) => void
// ): void;
/**
* Instantiate an empty BPE Model
*/
export function empty(): Model;
}
export namespace wordPiece {
export interface WordPieceModelOptions {
/**
* The maximum number of characters to allow in a single word.
* @default 100
*/
maxInputCharsPerWord?: number;
/**
* The unknown token to be used by the model.
* @default "[UNK]"
*/
unkToken?: string;
}
/**
* Instantiate a WordPiece model from the given vocab file
*
* @param {string} vocab Path to a vocabulary file
* @param [options] WordPiece model options
*/
export function fromFiles(vocab: string, options?: WordPieceModelOptions): Model;
/**
* Instantiate a WordPiece model from the given vocab file
*
* @param vocab Path to a vocabulary file
* @param options WordPiece model options
* @param __callback Callback called when model is loaded
*/
// export function fromFiles(
// vocab: string,
// options: WordPieceModelOptions | null,
// __callback: (err: any, model: Model) => void
// ): void;
/**
* Instantiate an empty WordPiece model
*/
export function empty(): Model;
}
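A minimal usage sketch of the new `bpe` and `wordPiece` namespaces declared above; file paths and option values are placeholders:

import { Tokenizer } from "../bindings/tokenizer";
import { bpe, wordPiece } from "../bindings/models";

// Load a BPE model from placeholder vocab/merges files, with a few options.
const bpeModel = bpe.fromFiles("./vocab.json", "./merges.txt", {
  dropout: 0.1,
  unkToken: "<unk>",
  endOfWordSuffix: "</w>"
});

// Load a WordPiece model from a placeholder vocab file.
const wpModel = wordPiece.fromFiles("./vocab.txt", {
  unkToken: "[UNK]",
  maxInputCharsPerWord: 100
});

// A Model is then wrapped by a Tokenizer.
const tokenizer = new Tokenizer(wpModel);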

View File

@@ -0,0 +1,12 @@
var addon = require('../../native');
module.exports = {
bpe: {
fromFiles: addon.models_BPE_from_files,
empty: addon.models_BPE_empty,
},
wordPiece: {
fromFiles: addon.models_WordPiece_from_files,
empty: addon.models_WordPiece_empty,
}
}

View File

@@ -4,68 +4,66 @@
*/
declare class Normalizer {}
export namespace normalizers {
interface BertNormalizerOptions {
/**
* Whether to clean the text, by removing any control characters
* and replacing all kinds of whitespace with the classic one.
* @default true
*/
cleanText?: boolean;
/**
* Whether to handle Chinese characters by putting spaces around them.
* @default true
*/
handleChineseChars?: boolean;
/**
* Whether to lowercase.
* @default true
*/
lowercase?: boolean;
/**
* Whether to strip all accents.
* @default true
*/
stripAccents?: boolean;
}
interface BertNormalizerOptions {
/**
* Instantiate a Bert Normalizer with the given options
*
* @param [options] Normalizer options
* @returns {Normalizer} Bert Normalizer. Takes care of normalizing raw text before giving it to a Bert model.
* This includes cleaning the text, handling accents, Chinese characters, and lowercasing
* Whether to clean the text, by removing any control characters
* and replacing all kinds of whitespace with the classic one.
* @default true
*/
export function bertNormalizer(options?: BertNormalizerOptions): Normalizer;
cleanText?: boolean;
/**
* Returns a new NFD Unicode Normalizer
* Whether to handle Chinese characters by putting spaces around them.
* @default true
*/
export function nfd(): Normalizer;
handleChineseChars?: boolean;
/**
* Returns a new NFKD Unicode Normalizer
* Whether to lowercase.
* @default true
*/
export function nfkd(): Normalizer;
lowercase?: boolean;
/**
* Returns a new NFC Unicode Normalizer
* Whether to strip all accents.
* @default true
*/
export function nfc(): Normalizer;
/**
* Returns a new NFKC Unicode Normalizer
*/
export function nfkc(): Normalizer;
/**
* Instantiate a new Normalization Sequence using the given normalizers
* @param normalizers A list of Normalizer to be run as a sequence
*/
export function sequence(normalizers: Normalizer[]): Normalizer;
/**
* Returns a new Lowercase Normalizer
*/
export function lowercase(): Normalizer;
stripAccents?: boolean;
}
/**
* Instantiate a Bert Normalizer with the given options
*
* @param [options] Normalizer options
* @returns {Normalizer} Bert Normalizer. Takes care of normalizing raw text before giving it to a Bert model.
* This includes cleaning the text, handling accents, Chinese characters, and lowercasing
*/
export function bertNormalizer(options?: BertNormalizerOptions): Normalizer;
/**
* Returns a new NFD Unicode Normalizer
*/
export function nfdNormalizer(): Normalizer;
/**
* Returns a new NFKD Unicode Normalizer
*/
export function nfkdNormalizer(): Normalizer;
/**
* Returns a new NFC Unicode Normalizer
*/
export function nfcNormalizer(): Normalizer;
/**
* Returns a new NFKC Unicode Normalizer
*/
export function nfkcNormalizer(): Normalizer;
/**
* Instantiate a new Normalization Sequence using the given normalizers
* @param normalizers A list of Normalizer to be run as a sequence
*/
export function sequenceNormalizer(normalizers: Normalizer[]): Normalizer;
/**
* Returns a new Lowercase Normalizer
*/
export function lowercaseNormalizer(): Normalizer;
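A short sketch of the renamed normalizer factories declared above, with an illustrative import path:

import {
  bertNormalizer,
  nfkcNormalizer,
  lowercaseNormalizer,
  sequenceNormalizer
} from "../bindings/normalizers";

// A Bert-style normalizer configured through BertNormalizerOptions.
const bert = bertNormalizer({ lowercase: true, stripAccents: true });

// Normalizers can also be chained: NFKC first, then lowercasing.
const chained = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);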

View File

@@ -0,0 +1,11 @@
var addon = require('../../native');
module.exports = {
bertNormalizer: addon.normalizers_BertNormalizer,
nfdNormalizer: addon.normalizers_NFD,
nfkdNormalizer: addon.normalizers_NFKD,
nfcNormalizer: addon.normalizers_NFC,
nfkcNormalizer: addon.normalizers_NFKC,
sequenceNormalizer: addon.normalizers_Sequence,
lowercaseNormalizer: addon.normalizers_Lowercase
};

View File

@@ -4,12 +4,10 @@
*/
declare class PostProcessor {}
export namespace postProcessors {
/**
* Instantiate a new BertProcessing with the given tokens
*
* @param {[string, number]} sep A tuple with the string representation of the SEP token, and its id
* @param {[string, number]} cls A tuple with the string representation of the CLS token, and its id
*/
export function BertProcessing(sep: [string, number], cls: [string, number]): PostProcessor;
}
/**
* Instantiate a new BertProcessing with the given tokens
*
* @param {[string, number]} sep A tuple with the string representation of the SEP token, and its id
* @param {[string, number]} cls A tuple with the string representation of the CLS token, and its id
*/
export function bertProcessing(sep: [string, number], cls: [string, number]): PostProcessor;
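A short sketch of `bertProcessing`; the token ids below are placeholders that would normally come from `tokenizer.tokenToId`:

import { bertProcessing } from "../bindings/post-processors";

// Placeholder ids; in practice use tokenizer.tokenToId("[SEP]") and
// tokenizer.tokenToId("[CLS]") against the loaded vocabulary.
const processor = bertProcessing(["[SEP]", 102], ["[CLS]", 101]);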

View File

@@ -0,0 +1,5 @@
var addon = require('../../native');
module.exports = {
bertProcessing: addon.processors_BertProcessing
};

View File

@@ -4,50 +4,46 @@
*/
declare class PreTokenizer {}
export namespace preTokenizers {
export namespace byteLevel {
/**
* Instantiate a new ByteLevel PreTokenizer
*
* @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
* This lets us treat `hello` exactly like `say hello`.
* @returns {PreTokenizer} ByteLevel PreTokenizer.
* This pre-tokenizer takes care of replacing all bytes of the given string
* with a corresponding representation, as well as splitting into words.
*/
export function byteLevel(addPrefixSpace?: boolean): PreTokenizer;
/**
* Returns the alphabet used by the ByteLevel PreTokenizer.
* Since the ByteLevel works as its name suggests, at the byte level, it
* encodes any byte to one visible character. This means that there is a
* total of 256 different characters composing this alphabet.
*/
export function byteLevelAlphabet(): string[];
}
/**
* Instantiate a new ByteLevel PreTokenizer
*
* @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
* This lets us treat `hello` exactly like `say hello`.
* @returns {PreTokenizer} ByteLevel PreTokenizer.
* This pre-tokenizer takes care of replacing all bytes of the given string
* with a corresponding representation, as well as splitting into words.
*/
export function byteLevelPreTokenizer(addPrefixSpace?: boolean): PreTokenizer;
/**
* Returns a Whitespace PreTokenizer
* This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
*/
export function whitespace(): PreTokenizer;
/**
* Returns the alphabet used by the ByteLevel PreTokenizer.
* Since the ByteLevel works as its name suggests, at the byte level, it
* encodes any byte to one visible character. This means that there is a
* total of 256 different characters composing this alphabet.
*/
export function byteLevelAlphabet(): string[];
/**
* Returns a new Bert PreTokenizer.
* This pre-tokenizer splits tokens on spaces, and also on punctuation.
* Each occurrence of a punctuation character will be treated separately.
*/
export function bertPreTokenizer(): PreTokenizer;
/**
* Returns a Whitespace PreTokenizer
* This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
*/
export function whitespacePreTokenizer(): PreTokenizer;
/**
* Returns a new Metaspace Tokenizer.
* This pre-tokenizer replaces any whitespace by the provided replacement character.
* It then tries to split on these spaces.
*
* @param {string} [replacement="▁"] The replacement character. Must be exactly one character.
* By default we use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
* @param {boolean} [addPrefixSpace] Whether to add a space to the first word if there isn't already one.
* This lets us treat `hello` exactly like `say hello`.
*/
export function metaspace(replacement?: string, addPrefixSpace?: boolean): PreTokenizer;
}
/**
* Returns a new Bert PreTokenizer.
* This pre-tokenizer splits tokens on spaces, and also on punctuation.
* Each occurrence of a punctuation character will be treated separately.
*/
export function bertPreTokenizer(): PreTokenizer;
/**
* Returns a new Metaspace Tokenizer.
* This pre-tokenizer replaces any whitespace by the provided replacement character.
* It then tries to split on these spaces.
*
* @param {string} [replacement="▁"] The replacement character. Must be exactly one character.
* By default we use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
* @param {boolean} [addPrefixSpace] Whether to add a space to the first word if there isn't already one.
* This lets us treat `hello` exactly like `say hello`.
*/
export function metaspacePreTokenizer(replacement?: string, addPrefixSpace?: boolean): PreTokenizer;
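A short sketch of the renamed pre-tokenizer factories declared above, with an illustrative import path:

import {
  byteLevelPreTokenizer,
  byteLevelAlphabet,
  whitespacePreTokenizer,
  bertPreTokenizer,
  metaspacePreTokenizer
} from "../bindings/pre-tokenizers";

const byteLevel = byteLevelPreTokenizer(true);       // add a prefix space
const alphabet = byteLevelAlphabet();                // the 256 visible characters
const whitespace = whitespacePreTokenizer();         // splits on \w+|[^\w\s]+
const bert = bertPreTokenizer();                     // splits on spaces and punctuation
const metaspace = metaspacePreTokenizer("▁", true);  // SentencePiece-style splitting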

View File

@@ -0,0 +1,9 @@
var addon = require('../../native');
module.exports = {
byteLevelPreTokenizer: addon.pre_tokenizers_ByteLevel,
byteLevelAlphabet: addon.pre_tokenizers_ByteLevel_Alphabet,
whitespacePreTokenizer: addon.pre_tokenizers_Whitespace,
bertPreTokenizer: addon.pre_tokenizers_BertPreTokenizer,
metaspacePreTokenizer: addon.pre_tokenizers_Metaspace
};

View File

@@ -0,0 +1,3 @@
var addon = require('../../native');
module.exports.Tokenizer = addon.tokenizer_Tokenizer;

View File

@@ -4,59 +4,57 @@
*/
declare class Trainer {}
export namespace trainers {
interface TrainerOptions {
/**
* A prefix to be used for every subword that is not a beginning-of-word.
*/
continuingSubwordPrefix?: string;
/**
* A suffix to be used for every subword that is an end-of-word.
*/
endOfWordSuffix?: string;
/**
* A list of characters to include in the initial alphabet, even
* if not seen in the training dataset.
* If a string contains more than one character, only the first one
* is kept.
* @default []
*/
initialAlphabet?: string[];
/**
* The maximum number of different characters to keep in the alphabet.
*/
limitAlphabet?: number;
/**
* The minimum frequency a pair should have in order to be merged.
* @default 2
*/
minFrequency?: number;
/**
* Whether to show progress bars while training.
* @default true
*/
showProgress?: boolean;
/**
* A list of special tokens the model should know of.
* @default []
*/
specialTokens?: string[];
/**
* The size of the final vocabulary, including all tokens and alphabet.
* @default 30000
*/
vocabSize?: number;
}
interface TrainerOptions {
/**
* Instantiate a new BPE Trainer
* @param {TrainerOptions} [options] BPE Trainer options
* A prefix to be used for every subword that is not a beginning-of-word.
*/
export function bpeTrainer(options?: TrainerOptions): Trainer;
continuingSubwordPrefix?: string;
/**
* Instantiate a new WordPiece Trainer
* @param {TrainerOptions} [options] WordPiece Trainer options
* A suffix to be used for every subword that is an end-of-word.
*/
export function wordPieceTrainer(options?: TrainerOptions): Trainer;
endOfWordSuffix?: string;
/**
* A list of characters to include in the initial alphabet, even
* if not seen in the training dataset.
* If a string contains more than one character, only the first one
* is kept.
* @default []
*/
initialAlphabet?: string[];
/**
* The maximum number of different characters to keep in the alphabet.
*/
limitAlphabet?: number;
/**
* The minimum frequency a pair should have in order to be merged.
* @default 2
*/
minFrequency?: number;
/**
* Whether to show progress bars while training.
* @default true
*/
showProgress?: boolean;
/**
* A list of special tokens the model should know of.
* @default []
*/
specialTokens?: string[];
/**
* The size of the final vocabulary, including all tokens and alphabet.
* @default 30000
*/
vocabSize?: number;
}
/**
* Instantiate a new BPE Trainer
* @param {TrainerOptions} [options] BPE Trainer options
*/
export function bpeTrainer(options?: TrainerOptions): Trainer;
/**
* Instantiate a new WordPiece Trainer
* @param {TrainerOptions} [options] WordPiece Trainer options
*/
export function wordPieceTrainer(options?: TrainerOptions): Trainer;
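A short sketch of the trainer factories declared above; the option values are illustrative:

import { bpeTrainer, wordPieceTrainer } from "../bindings/trainers";

// Both factories take the same TrainerOptions shape.
const trainer = wordPieceTrainer({
  vocabSize: 30000,
  minFrequency: 2,
  specialTokens: ["[UNK]", "[SEP]", "[CLS]"],
  continuingSubwordPrefix: "##"
});
// A trainer is then consumed by tokenizer.train(trainer, files).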

View File

@@ -0,0 +1,6 @@
var addon = require('../../native');
module.exports = {
bpeTrainer: addon.trainers_BPETrainer,
wordPieceTrainer: addon.trainers_WordPieceTrainer
};

View File

@@ -2,7 +2,7 @@ import { promisify } from "util";
import { Encoding, Tokenizer } from "../bindings/tokenizer";
export class BaseTokenizer {
constructor(private tokenizer: Tokenizer) {}
constructor(protected tokenizer: Tokenizer) {}
/**
* Encode the given sequence
@@ -11,7 +11,7 @@ export class BaseTokenizer {
* @param {(string | null)} pair The optional pair sequence
*/
async encode(sequence: string, pair?: string): Promise<Encoding> {
const encode = promisify(this.tokenizer.encode);
const encode = promisify(this.tokenizer.encode.bind(this.tokenizer));
return encode(sequence, pair ?? null);
}
@@ -22,7 +22,7 @@ export class BaseTokenizer {
* The list can contain both at the same time.
*/
async encodeBatch(sequences: (string | [string, string])[]): Promise<Encoding[]> {
const encodeBatch = promisify(this.tokenizer.encodeBatch);
const encodeBatch = promisify(this.tokenizer.encodeBatch.bind(this.tokenizer));
return encodeBatch(sequences);
}
}
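The two `.bind(this.tokenizer)` additions matter because `promisify` returns a detached function: without binding, the native method would be invoked with `this` undefined and fail at call time. A minimal standalone illustration (the class and names are hypothetical, not part of this library):

import { promisify } from "util";

class Native {
  private label = "native";
  greet(callback: (err: Error | null, result: string) => void): void {
    // Needs `this` to be the Native instance for `this.label` to exist.
    callback(null, this.label);
  }
}

const native = new Native();

// promisify(native.greet) would detach the method from its instance, so
// `this.label` would fail when called; binding first keeps it working.
const greet = promisify(native.greet.bind(native));
greet().then(console.log); // "native"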

View File

@@ -1,9 +1,14 @@
import { promisify } from "util";
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, models } from "../bindings/models";
import { Model, wordPiece } from "../bindings/models";
import { bertNormalizer } from "../bindings/normalizers";
import { bertPreTokenizer } from "../bindings/pre-tokenizers";
import { bertProcessing } from "../bindings/post-processors";
import { wordPieceDecoder } from "../bindings/decoders";
import { wordPieceTrainer } from "../bindings/trainers";
interface BertWordpieceOptions {
export interface BertWordPieceOptions {
/**
* @default true
*/
@@ -43,42 +48,121 @@ interface BertWordpieceOptions {
wordpiecesPrefix?: string;
}
const defaultBertOptions: Required<Omit<BertWordpieceOptions, 'vocabFile'>> & { vocabFile?: string } = {
addSpecialTokens: true,
cleanText: true,
clsToken: '[CLS]',
handleChineseChars: true,
lowercase: true,
sepToken: '[SEP]',
stripAccents: true,
unkToken: '[UNK]',
wordpiecesPrefix: '##'
};
/**
* Instantiate and returns a new Bert WordPiece tokenizer
* @param options
*/
export async function getBertWordpieceTokenizer(options?: BertWordpieceOptions): Promise<BertWordpieceTokenizer> {
const mergedOptions = { ...defaultBertOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile) {
const fromFiles = promisify(models.WordPiece.fromFiles);
model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
} else {
model = models.WordPiece.empty();
}
const tokenizer = new Tokenizer(model);
return new BertWordpieceTokenizer(tokenizer);
export interface BertWordPieceTrainOptions {
/**
* @default []
*/
initialAlphabet?: string[];
/**
* @default 1000
*/
limitAlphabet?: number;
/**
* @default 2
*/
minFrequency?: number;
/**
* @default true
*/
showProgress?: boolean;
/**
* @default ["[UNK]", "[SEP]", "[CLS]"]
*/
specialTokens?: string[];
/**
* @default 30000
*/
vocabSize?: number;
/**
* @default "##"
*/
wordpiecesPrefix?: string;
}
/**
* Bert WordPiece Tokenizer
*/
class BertWordpieceTokenizer extends BaseTokenizer {
constructor(tokenizer: Tokenizer) {
export class BertWordPieceTokenizer extends BaseTokenizer {
private static readonly defaultBertOptions:
Required<Omit<BertWordPieceOptions, "vocabFile">> & { vocabFile?: string } = {
addSpecialTokens: true,
cleanText: true,
clsToken: "[CLS]",
handleChineseChars: true,
lowercase: true,
sepToken: "[SEP]",
stripAccents: true,
unkToken: "[UNK]",
wordpiecesPrefix: "##"
};
private readonly defaultTrainOptions: Required<BertWordPieceTrainOptions> = {
initialAlphabet: [],
limitAlphabet: 1000,
minFrequency: 2,
showProgress: true,
specialTokens: ["[UNK]", "[SEP]", "[CLS]"],
vocabSize: 30000,
wordpiecesPrefix: "##"
};
private constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
/**
* Instantiate and returns a new Bert WordPiece tokenizer
* @param [options] Optional tokenizer options
*/
static async fromOptions(options?: BertWordPieceOptions): Promise<BertWordPieceTokenizer> {
const mergedOptions = { ...this.defaultBertOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile) {
// const fromFiles = promisify(WordPiece.fromFiles);
model = wordPiece.fromFiles(mergedOptions.vocabFile, { unkToken: mergedOptions.unkToken });
// model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
} else {
model = wordPiece.empty();
}
const tokenizer = new Tokenizer(model);
const normalizer = bertNormalizer(mergedOptions);
tokenizer.setNormalizer(normalizer);
tokenizer.setPreTokenizer(bertPreTokenizer());
const sepTokenId = tokenizer.tokenToId(mergedOptions.sepToken);
if (sepTokenId === undefined) {
throw new Error("sepToken not found in the vocabulary");
}
const clsTokenId = tokenizer.tokenToId(mergedOptions.clsToken);
if (clsTokenId === undefined) {
throw new Error("clsToken not found in the vocabulary");
}
if (mergedOptions.addSpecialTokens) {
const processor = bertProcessing([mergedOptions.sepToken, sepTokenId], [mergedOptions.clsToken, clsTokenId]);
tokenizer.setPostProcessor(processor);
}
const decoder = wordPieceDecoder(mergedOptions.wordpiecesPrefix);
tokenizer.setDecoder(decoder);
return new BertWordPieceTokenizer(tokenizer);
}
/**
* Train the model using the given files
*
* @param files Files to use for training
* @param [options] Training options
*/
async train(files: string[], options?: BertWordPieceTrainOptions): Promise<void> {
const mergedOptions = { ...this.defaultTrainOptions, ...options };
const trainer = wordPieceTrainer(mergedOptions);
this.tokenizer.train(trainer, files);
}
}
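A minimal usage sketch of the new `BertWordPieceTokenizer` API; the vocabulary path is a placeholder:

import { BertWordPieceTokenizer } from "./bert-wordpiece.tokenizer";

async function example(): Promise<void> {
  // "./vocab.txt" is a placeholder path to a WordPiece vocabulary.
  const tokenizer = await BertWordPieceTokenizer.fromOptions({ vocabFile: "./vocab.txt" });
  const encoding = await tokenizer.encode("Hello there!", "How are you?");
  console.log(encoding); // Encoding produced for the sentence pair
}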

View File

@@ -1,52 +1,126 @@
import { promisify } from "util";
import { BaseTokenizer } from "./base.tokenizer";
import { Model, models } from "../bindings/models";
import { Model, bpe } from "../bindings/models";
import { Tokenizer } from "../bindings/tokenizer";
import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
import { whitespacePreTokenizer } from "../bindings/pre-tokenizers";
import { bpeDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";
interface BPEOptions {
export interface BPETokenizerOptions {
dropout?: number;
mergesFile?: string;
/**
* @default "</w>"
*/
suffix?: string;
/**
* @default "<unk>"
*/
unkToken?: string;
vocabFile?: string;
}
const defaultBPEOptions: BPEOptions & Required<Pick<BPEOptions, 'unkToken' | 'suffix'>> = {
suffix: '</w>',
unkToken: '<unk>'
};
/**
* Instantiate and returns a new BPE tokenizer
* @param options
*/
export async function getBPETokenizer(options?: BPEOptions): Promise<BPETokenizer> {
const mergedOptions = { ...defaultBPEOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
const fromFiles = promisify(models.BPE.fromFiles);
const modelOptions: models.BPE.BPEOptions = {
dropout: mergedOptions.dropout,
endOfWordSuffix: mergedOptions.suffix,
unkToken: mergedOptions.unkToken
};
model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
} else {
model = models.BPE.empty();
}
const tokenizer = new Tokenizer(model);
return new BPETokenizer(tokenizer);
export interface BPETokenizerTrainOptions {
/**
* @default []
*/
initialAlphabet?: string[];
/**
* @default 1000
*/
limitAlphabet?: number;
/**
* @default 2
*/
minFrequency?: number;
/**
* @default true
*/
showProgress?: boolean;
/**
* @default ["<unk>"]
*/
specialTokens?: string[];
/**
* @default "</w>"
*/
suffix?: string;
/**
* @default 30000
*/
vocabSize?: number;
}
/**
* Original BPE Tokenizer.
* Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
*/
class BPETokenizer extends BaseTokenizer {
constructor(tokenizer: Tokenizer) {
export class BPETokenizer extends BaseTokenizer {
private static readonly defaultBPEOptions:
BPETokenizerOptions & Required<Pick<BPETokenizerOptions, "unkToken" | "suffix">> = {
suffix: "</w>",
unkToken: "<unk>"
};
private readonly defaultTrainOptions: Required<BPETokenizerTrainOptions> = {
initialAlphabet: [],
limitAlphabet: 1000,
minFrequency: 2,
showProgress: true,
specialTokens: ["<unk>"],
suffix: "</w>",
vocabSize: 30000
};
private constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
/**
* Instantiate and returns a new BPE tokenizer
* @param [options] Optional tokenizer options
*/
static async fromOptions(options?: BPETokenizerOptions): Promise<BPETokenizer> {
const mergedOptions = { ...this.defaultBPEOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
// const fromFiles = promisify(BPE.fromFiles);
const modelOptions: bpe.BPEModelOptions = {
dropout: mergedOptions.dropout,
endOfWordSuffix: mergedOptions.suffix,
unkToken: mergedOptions.unkToken
};
model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
// model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
} else {
model = bpe.empty();
}
const tokenizer = new Tokenizer(model);
const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);
tokenizer.setNormalizer(normalizer);
tokenizer.setPreTokenizer(whitespacePreTokenizer());
const decoder = bpeDecoder(mergedOptions.suffix);
tokenizer.setDecoder(decoder);
return new BPETokenizer(tokenizer);
}
/**
* Train the model using the given files
*
* @param files Files to use for training
* @param [options] Training options
*/
async train(files: string[], options?: BPETokenizerTrainOptions): Promise<void> {
const mergedOptions = { ...this.defaultTrainOptions, ...options };
const trainer = bpeTrainer(mergedOptions);
this.tokenizer.train(trainer, files);
}
}
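A minimal usage sketch of the new `BPETokenizer` API, training an empty model from scratch; the corpus paths are placeholders:

import { BPETokenizer } from "./bpe.tokenizer";

async function example(): Promise<void> {
  // No vocab/merges files given, so the tokenizer starts from an empty BPE model.
  const tokenizer = await BPETokenizer.fromOptions();
  await tokenizer.train(["./corpus-a.txt", "./corpus-b.txt"], { vocabSize: 5000 });
  const encoding = await tokenizer.encode("hello world");
  console.log(encoding);
}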

View File

@@ -0,0 +1,91 @@
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, bpe } from "../bindings/models";
import { nfkcNormalizer } from "../bindings/normalizers";
import { byteLevelPreTokenizer, byteLevelAlphabet } from "../bindings/pre-tokenizers";
import { byteLevelDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";
export interface ByteLevelBPETokenizerOptions {
/**
* @default false
*/
addPrefixSpace?: boolean;
mergesFile?: string;
vocabFile?: string;
}
export interface ByteLevelBPETrainOptions {
/**
* @default 2
*/
minFrequency?: number;
/**
* @default true
*/
showProgress?: boolean;
/**
* @default []
*/
specialTokens?: string[];
/**
* @default 30000
*/
vocabSize?: number;
}
/**
* Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
*/
export class ByteLevelBPETokenizer extends BaseTokenizer {
private static readonly defaultOptions:
ByteLevelBPETokenizerOptions & Required<Pick<ByteLevelBPETokenizerOptions, 'addPrefixSpace'>> = {
addPrefixSpace: false
};
private readonly defaultTrainOptions: Required<ByteLevelBPETrainOptions> = {
minFrequency: 2,
showProgress: true,
specialTokens: ['<unk>'],
vocabSize: 30000
};
private constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
static async fromOptions(options?: ByteLevelBPETokenizerOptions): Promise<ByteLevelBPETokenizer> {
const mergedOptions = { ...this.defaultOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
// const fromFiles = promisify(BPE.fromFiles);
model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile);
// model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, null);
} else {
model = bpe.empty();
}
const tokenizer = new Tokenizer(model);
tokenizer.setNormalizer(nfkcNormalizer());
const preTokenizer = byteLevelPreTokenizer(mergedOptions.addPrefixSpace);
tokenizer.setPreTokenizer(preTokenizer);
tokenizer.setDecoder(byteLevelDecoder());
return new ByteLevelBPETokenizer(tokenizer);
}
/**
* Train the model using the given files
*
* @param files Files to use for training
* @param [options] Training options
*/
async train(files: string[], options?: ByteLevelBPETrainOptions): Promise<void> {
const mergedOptions = { ...this.defaultTrainOptions, ...options };
const trainer = bpeTrainer({ ...mergedOptions, initialAlphabet: byteLevelAlphabet() });
this.tokenizer.train(trainer, files);
}
}

View File

@@ -1,2 +1,4 @@
export * from './bert-wordpiece.tokenizer';
export * from './bpe.tokenizer';
export * from './byte-level-bpe.tokenizer';
export * from './sentence-piece.tokenizer';

View File

@@ -0,0 +1,121 @@
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, bpe } from "../bindings/models";
import { nfkcNormalizer } from "../bindings/normalizers";
import { metaspacePreTokenizer } from "../bindings/pre-tokenizers";
import { metaspaceDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";
export interface SentencePieceTokenizerOptions extends OptionsWithDefaults {
dropout?: number;
mergesFile?: string;
vocabFile?: string;
}
interface OptionsWithDefaults {
/**
* @default true
*/
addPrefixSpace?: boolean;
/**
* @default "▁"
*/
replacement?: string;
/**
* @default "<unk>"
*/
unkToken?: string;
}
export interface SentencePieceTrainOptions {
/**
* @default []
*/
initialAlphabet?: string[];
/**
* @default 1000
*/
limitAlphabet?: number;
/**
* @default 2
*/
minFrequency?: number;
/**
* @default true
*/
showProgress?: boolean;
/**
* @default ["<unk>"]
*/
specialTokens?: string[];
/**
* @default 30000
*/
vocabSize?: number;
}
/**
* Represents the BPE algorithm, with the pretokenization used by SentencePiece
*/
export class SentencePieceTokenizer extends BaseTokenizer {
private static readonly defaultOptions: SentencePieceTokenizerOptions & Required<OptionsWithDefaults> = {
addPrefixSpace: true,
replacement: '▁',
unkToken: '<unk>'
};
private readonly defaultTrainOptions: Required<SentencePieceTrainOptions> = {
initialAlphabet: [],
limitAlphabet: 1000,
minFrequency: 2,
showProgress: true,
specialTokens: ['<unk>'],
vocabSize: 30000
};
private constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
static async fromOptions(options?: SentencePieceTokenizerOptions): Promise<SentencePieceTokenizer> {
const mergedOptions = { ...this.defaultOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
// const fromFiles = promisify(BPE.fromFiles);
const modelOptions: bpe.BPEModelOptions = {
dropout: mergedOptions.dropout,
unkToken: mergedOptions.unkToken
};
model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
// model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, null);
} else {
model = bpe.empty();
}
const tokenizer = new Tokenizer(model);
tokenizer.setNormalizer(nfkcNormalizer());
const preTokenizer = metaspacePreTokenizer(mergedOptions.replacement, mergedOptions.addPrefixSpace);
tokenizer.setPreTokenizer(preTokenizer);
const decoder = metaspaceDecoder(mergedOptions.replacement, mergedOptions.addPrefixSpace);
tokenizer.setDecoder(decoder);
return new SentencePieceTokenizer(tokenizer);
}
/**
* Train the model using the given files
*
* @param files Files to use for training
* @param [options] Training options
*/
async train(files: string[], options?: SentencePieceTrainOptions): Promise<void> {
const mergedOptions = { ...this.defaultTrainOptions, ...options };
const trainer = bpeTrainer(mergedOptions);
this.tokenizer.train(trainer, files);
}
}
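A minimal usage sketch of the new `SentencePieceTokenizer` API; file paths are placeholders:

import { SentencePieceTokenizer } from "./sentence-piece.tokenizer";

async function example(): Promise<void> {
  // Placeholder paths to an existing BPE vocab/merges pair.
  const tokenizer = await SentencePieceTokenizer.fromOptions({
    vocabFile: "./vocab.json",
    mergesFile: "./merges.txt"
  });
  const encodings = await tokenizer.encodeBatch(["first sentence", "second sentence"]);
  console.log(encodings.length);
}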

View File

@@ -1,2 +1 @@
export * from './bindings';
export * from './implementations';

View File

@@ -3,5 +3,5 @@ function __export(m) {
for (var p in m) if (!exports.hasOwnProperty(p)) exports[p] = m[p];
}
Object.defineProperty(exports, "__esModule", { value: true });
__export(require("./bindings"));
// export * from './bindings';
__export(require("./implementations"));

View File

@@ -1,2 +1,2 @@
export * from './bindings';
// export * from './bindings';
export * from './implementations';