mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)

Commit message: big big big
bindings/node/lib/bindings/decoders.d.ts (vendored, 50 lines changed)
@@ -4,32 +4,30 @@
 */
declare class Decoder {}

export namespace decoders {
/**
 * Instantiate a new ByteLevel Decoder
 */
export function ByteLevel(): Decoder;
/**
 * Instantiate a new ByteLevel Decoder
 */
export function byteLevelDecoder(): Decoder;

/**
 * Instantiate a new WordPiece Decoder
 * @param {string} [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
 */
export function WordPiece(prefix?: string): Decoder;
/**
 * Instantiate a new WordPiece Decoder
 * @param {string} [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
 */
export function wordPieceDecoder(prefix?: string): Decoder;

/**
 * Instantiate a new Metaspace
 *
 * @param {string} [replacement='▁'] The replacement character.
 * Must be exactly one character. By default we use the `▁` (U+2581) meta symbol (same as in SentencePiece).
 * @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
 * This lets us treat `hello` exactly like `say hello`.
 */
export function Metaspace(replacement?: string, addPrefixSpace?: boolean): Decoder;
/**
 * Instantiate a new Metaspace
 *
 * @param {string} [replacement='▁'] The replacement character.
 * Must be exactly one character. By default we use the `▁` (U+2581) meta symbol (same as in SentencePiece).
 * @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
 * This lets us treat `hello` exactly like `say hello`.
 */
export function metaspaceDecoder(replacement?: string, addPrefixSpace?: boolean): Decoder;

/**
 * Instantiate a new BPEDecoder
 * @param {string} [suffix='</w>'] The suffix that was used to characterize an end-of-word.
 * This suffix will be replaced by whitespaces during decoding
 */
export function BPEDecoder(suffix?: string): Decoder;
}
/**
 * Instantiate a new BPE Decoder
 * @param {string} [suffix='</w>'] The suffix that was used to characterize an end-of-word.
 * This suffix will be replaced by whitespaces during decoding
 */
export function bpeDecoder(suffix?: string): Decoder;
bindings/node/lib/bindings/decoders.js (new file, 8 lines)
@@ -0,0 +1,8 @@
var addon = require('../../native');

module.exports = {
  byteLevelDecoder: addon.decoders_ByteLevel,
  wordPieceDecoder: addon.decoders_WordPiece,
  metaspaceDecoder: addon.decoders_Metaspace,
  bpeDecoder: addon.decoders_BPEDecoder
};
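For orientation, a minimal usage sketch of the renamed decoder factories declared above; the relative import path mirrors the one used by the implementation files in this commit and is an assumption, not part of the diff:

import { wordPieceDecoder, metaspaceDecoder } from "../bindings/decoders";

// The camelCase factories now live at the top level of the module
const wp = wordPieceDecoder("##");      // strips the '##' continuation prefix when decoding
const sp = metaspaceDecoder("▁", true); // maps the ▁ (U+2581) meta symbol back to spaces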
bindings/node/lib/bindings/index.d.ts (vendored, 7 lines removed)
@@ -1,7 +0,0 @@
export { decoders } from './decoders';
export { models } from './models';
export { normalizers } from './normalizers';
export { preTokenizers } from './pre-tokenizers';
export { postProcessors } from './post-processors';
export { Tokenizer } from './tokenizer';
export { trainers } from './trainers'
@@ -1,19 +0,0 @@
var addon = require('../../native');

exports.Tokenizer = addon.tokenizer_Tokenizer;
exports.models = {
  BPE: {
    fromFiles: addon.models_BPE_from_files,
    empty: addon.models_BPE_empty,
  },
  WordPiece: addon.models_WordPiece,
}
exports.decoders = {
  ByteLevel: addon.decoders_ByteLevel,
  WordPiece: addon.decoders_WordPiece,
  Metaspace: addon.decoders_Metaspace,
  BPEDecoder: addon.decoders_BPEDecoder,
}
exports.post_processors = {
  BertProcessing: addon.processors_BertProcessing,
}
bindings/node/lib/bindings/models.d.ts (vendored, 154 lines changed)
@@ -6,73 +6,103 @@ declare class Model {
}

export namespace models {
export namespace BPE {
export interface BPEOptions {
/**
 * The number of words that the BPE cache can contain. The cache allows
 * to speed-up the process by keeping the result of the merge operations
 * for a number of words.
 */
cacheCapacity?: number;
/**
 * The BPE dropout to use. Must be a float between 0 and 1
 */
dropout?: number;
/**
 * The unknown token to be used by the model
 */
unkToken?: string;
/**
 * The prefix to attach to subword units that don't represent a beginning of word
 */
continuingSubwordPrefix?: string;
/**
 * The suffix to attach to subword units that represent an end of word
 */
endOfWordSuffix?: string;
}

export namespace bpe {
export interface BPEModelOptions {
/**
 * Instantiate a BPE model from the given vocab and merges files
 *
 * @param {string} vocab Path to a vocabulary JSON file
 * @param {string} merges Path to a merge file
 * @param {BPEOptions} [options] BPE model options
 * @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
 * The number of words that the BPE cache can contain. The cache allows
 * to speed-up the process by keeping the result of the merge operations
 * for a number of words.
 */
export function fromFiles(
  vocab: string,
  merges: string,
  options: BPEOptions | null,
  __callback: (err: any, model: Model) => void
): void;

cacheCapacity?: number;
/**
 * Instantiate an empty BPE Model
 * The BPE dropout to use. Must be a float between 0 and 1
 */
export function empty(): Model;
dropout?: number;
/**
 * The unknown token to be used by the model
 */
unkToken?: string;
/**
 * The prefix to attach to subword units that don't represent a beginning of word
 */
continuingSubwordPrefix?: string;
/**
 * The suffix to attach to subword units that represent an end of word
 */
endOfWordSuffix?: string;
}

export namespace WordPiece {
/**
 * Instantiate a WordPiece model from the given vocab file
 *
 * @param {string} vocab Path to a vocabulary file
 * @param {string} [unkToken] The unknown token to be used by the model
 * @param {number} [maxInputCharsPerWord] The maximum number of characters to authorize in a single word
 * @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
 */
export function fromFiles(
  vocab: string,
  unkToken: string,
  maxInputCharsPerWord: number | null,
  __callback: (err: any, model: Model) => void
): void;
/**
 * Instantiate a BPE model from the given vocab and merges files
 *
 * @param vocab Path to a vocabulary JSON file
 * @param merges Path to a merge file
 * @param [options] BPE model options
 */
export function fromFiles(
  vocab: string,
  merges: string,
  options?: BPEModelOptions
): Model;

/**
 * Instantiate an empty WordPiece model
 */
export function empty(): Model;
}
/**
 * Instantiate a BPE model from the given vocab and merges files
 *
 * @param vocab Path to a vocabulary JSON file
 * @param merges Path to a merge file
 * @param options BPE model options
 * @param __callback Callback called when model is loaded
 */
// export function fromFiles(
//   vocab: string,
//   merges: string,
//   options: BPEModelOptions | null,
//   __callback: (err: any, model: Model) => void
// ): void;

/**
 * Instantiate an empty BPE Model
 */
export function empty(): Model;
}

export namespace wordPiece {
export interface WordPieceModelOptions {
/**
 * The maximum number of characters to authorize in a single word.
 * @default 100
 */
maxInputCharsPerWord?: number;
/**
 * The unknown token to be used by the model.
 * @default "[UNK]"
 */
unkToken?: string;
}

/**
 * Instantiate a WordPiece model from the given vocab file
 *
 * @param {string} vocab Path to a vocabulary file
 * @param [options] WordPiece model options
 */
export function fromFiles(vocab: string, options?: WordPieceModelOptions): Model;

/**
 * Instantiate a WordPiece model from the given vocab file
 *
 * @param vocab Path to a vocabulary file
 * @param options WordPiece model options
 * @param __callback Callback called when model is loaded
 */
// export function fromFiles(
//   vocab: string,
//   options: WordPieceModelOptions | null,
//   __callback: (err: any, model: Model) => void
// ): void;

/**
 * Instantiate an empty WordPiece model
 */
export function empty(): Model;
}
bindings/node/lib/bindings/models.js (new file, 12 lines)
@@ -0,0 +1,12 @@
var addon = require('../../native');

module.exports = {
  bpe: {
    fromFiles: addon.models_BPE_from_files,
    empty: addon.models_BPE_empty,
  },
  wordPiece: {
    fromFiles: addon.models_WordPiece_from_files,
    empty: addon.models_WordPiece_empty,
  }
}
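A hedged sketch of how the new lower-cased model namespaces might be consumed, based only on the declarations above; the import path and the vocab/merges file names are placeholders:

import { bpe, wordPiece } from "../bindings/models";

// BPE model from vocab/merges files, with optional BPEModelOptions
const bpeModel = bpe.fromFiles("vocab.json", "merges.txt", { unkToken: "<unk>" });

// WordPiece model from a single vocab file
const wpModel = wordPiece.fromFiles("vocab.txt", { unkToken: "[UNK]", maxInputCharsPerWord: 100 });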
bindings/node/lib/bindings/normalizers.d.ts (vendored, 110 lines changed)
@@ -4,68 +4,66 @@
 */
declare class Normalizer {}

export namespace normalizers {
interface BertNormalizerOptions {
/**
 * Whether to clean the text, by removing any control characters
 * and replacing all whitespaces by the classic one.
 * @default true
 */
cleanText?: boolean;
/**
 * Whether to handle Chinese chars by putting spaces around them.
 * @default true
 */
handleChineseChars?: boolean;
/**
 * Whether to lowercase.
 * @default true
 */
lowercase?: boolean;
/**
 * Whether to strip all accents.
 * @default true
 */
stripAccents?: boolean;
}

interface BertNormalizerOptions {
/**
 * Instantiate a Bert Normalizer with the given options
 *
 * @param [options] Normalizer options
 * @returns {Normalizer} Bert Normalizer. Takes care of normalizing raw text before giving it to a Bert model.
 * This includes cleaning the text, handling accents, Chinese chars and lowercasing
 * Whether to clean the text, by removing any control characters
 * and replacing all whitespaces by the classic one.
 * @default true
 */
export function bertNormalizer(options?: BertNormalizerOptions): Normalizer;

cleanText?: boolean;
/**
 * Returns a new NFD Unicode Normalizer
 * Whether to handle Chinese chars by putting spaces around them.
 * @default true
 */
export function nfd(): Normalizer;

handleChineseChars?: boolean;
/**
 * Returns a new NFKD Unicode Normalizer
 * Whether to lowercase.
 * @default true
 */
export function nfkd(): Normalizer;

lowercase?: boolean;
/**
 * Returns a new NFC Unicode Normalizer
 * Whether to strip all accents.
 * @default true
 */
export function nfc(): Normalizer;

/**
 * Returns a new NFKC Unicode Normalizer
 */
export function nfkc(): Normalizer;

/**
 * Instantiate a new Normalization Sequence using the given normalizers
 * @param normalizers A list of Normalizer to be run as a sequence
 */
export function sequence(normalizers: Normalizer[]): Normalizer;

/**
 * Returns a new Lowercase Normalizer
 */
export function lowercase(): Normalizer;
stripAccents?: boolean;
}

/**
 * Instantiate a Bert Normalizer with the given options
 *
 * @param [options] Normalizer options
 * @returns {Normalizer} Bert Normalizer. Takes care of normalizing raw text before giving it to a Bert model.
 * This includes cleaning the text, handling accents, Chinese chars and lowercasing
 */
export function bertNormalizer(options?: BertNormalizerOptions): Normalizer;

/**
 * Returns a new NFD Unicode Normalizer
 */
export function nfdNormalizer(): Normalizer;

/**
 * Returns a new NFKD Unicode Normalizer
 */
export function nfkdNormalizer(): Normalizer;

/**
 * Returns a new NFC Unicode Normalizer
 */
export function nfcNormalizer(): Normalizer;

/**
 * Returns a new NFKC Unicode Normalizer
 */
export function nfkcNormalizer(): Normalizer;

/**
 * Instantiate a new Normalization Sequence using the given normalizers
 * @param normalizers A list of Normalizer to be run as a sequence
 */
export function sequenceNormalizer(normalizers: Normalizer[]): Normalizer;

/**
 * Returns a new Lowercase Normalizer
 */
export function lowercaseNormalizer(): Normalizer;
bindings/node/lib/bindings/normalizers.js (new file, 11 lines)
@@ -0,0 +1,11 @@
var addon = require('../../native');

module.exports = {
  bertNormalizer: addon.normalizers_BertNormalizer,
  nfdNormalizer: addon.normalizers_NFD,
  nfkdNormalizer: addon.normalizers_NFKD,
  nfcNormalizer: addon.normalizers_NFC,
  nfkcNormalizer: addon.normalizers_NFKC,
  sequenceNormalizer: addon.normalizers_Sequence,
  lowercaseNormalizer: addon.normalizers_Lowercase
};
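As an illustration only, the renamed normalizer factories compose exactly as declared above; the import path is an assumption:

import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer, bertNormalizer } from "../bindings/normalizers";

// Run NFKC unicode normalization, then lowercasing, as a single Normalizer
const basic = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);

// Bert-style normalization with partial options; unspecified fields keep their documented defaults
const bert = bertNormalizer({ lowercase: true, stripAccents: false });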
bindings/node/lib/bindings/post-processors.d.ts (vendored, 16 lines changed)
@@ -4,12 +4,10 @@
 */
declare class PostProcessor {}

export namespace postProcessors {
/**
 * Instantiate a new BertProcessing with the given tokens
 *
 * @param {[string, number]} sep A tuple with the string representation of the SEP token, and its id
 * @param {[string, number]} cls A tuple with the string representation of the CLS token, and its id
 */
export function BertProcessing(sep: [string, number], cls: [string, number]): PostProcessor;
}
/**
 * Instantiate a new BertProcessing with the given tokens
 *
 * @param {[string, number]} sep A tuple with the string representation of the SEP token, and its id
 * @param {[string, number]} cls A tuple with the string representation of the CLS token, and its id
 */
export function bertProcessing(sep: [string, number], cls: [string, number]): PostProcessor;
bindings/node/lib/bindings/post-processors.js (new file, 5 lines)
@@ -0,0 +1,5 @@
var addon = require('../../native');

module.exports = {
  bertProcessing: addon.processors_BertProcessing
};
bindings/node/lib/bindings/pre-tokenizers.d.ts (vendored, 84 lines changed)
@@ -4,50 +4,46 @@
 */
declare class PreTokenizer {}

export namespace preTokenizers {
export namespace byteLevel {
/**
 * Instantiate a new ByteLevel PreTokenizer
 *
 * @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
 * This lets us treat `hello` exactly like `say hello`.
 * @returns {PreTokenizer} ByteLevel PreTokenizer.
 * This pre-tokenizer takes care of replacing all bytes of the given string
 * with a corresponding representation, as well as splitting into words.
 */
export function byteLevel(addPrefixSpace?: boolean): PreTokenizer;

/**
 * Returns the alphabet used by the ByteLevel PreTokenizer.
 * Since the ByteLevel works as its name suggests, at the byte level, it
 * encodes any byte to one visible character. This means that there is a
 * total of 256 different characters composing this alphabet.
 */
export function byteLevelAlphabet(): string[];
}
/**
 * Instantiate a new ByteLevel PreTokenizer
 *
 * @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
 * This lets us treat `hello` exactly like `say hello`.
 * @returns {PreTokenizer} ByteLevel PreTokenizer.
 * This pre-tokenizer takes care of replacing all bytes of the given string
 * with a corresponding representation, as well as splitting into words.
 */
export function byteLevelPreTokenizer(addPrefixSpace?: boolean): PreTokenizer;

/**
 * Returns a Whitespace PreTokenizer
 * This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
 */
export function whitespace(): PreTokenizer;
/**
 * Returns the alphabet used by the ByteLevel PreTokenizer.
 * Since the ByteLevel works as its name suggests, at the byte level, it
 * encodes any byte to one visible character. This means that there is a
 * total of 256 different characters composing this alphabet.
 */
export function byteLevelAlphabet(): string[];

/**
 * Returns a new Bert PreTokenizer.
 * This pre-tokenizer splits tokens on spaces, and also on punctuation.
 * Each occurrence of a punctuation character will be treated separately.
 */
export function bertPreTokenizer(): PreTokenizer;
/**
 * Returns a Whitespace PreTokenizer
 * This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
 */
export function whitespacePreTokenizer(): PreTokenizer;

/**
 * Returns a new Metaspace Tokenizer.
 * This pre-tokenizer replaces any whitespace by the provided replacement character.
 * It then tries to split on these spaces.
 *
 * @param {string} [replacement="▁"] The replacement character. Must be exactly one character.
 * By default we use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
 * @param {boolean} [addPrefixSpace] Whether to add a space to the first word if there isn't already one.
 * This lets us treat `hello` exactly like `say hello`.
 */
export function metaspace(replacement?: string, addPrefixSpace?: boolean): PreTokenizer;
}
/**
 * Returns a new Bert PreTokenizer.
 * This pre-tokenizer splits tokens on spaces, and also on punctuation.
 * Each occurrence of a punctuation character will be treated separately.
 */
export function bertPreTokenizer(): PreTokenizer;

/**
 * Returns a new Metaspace Tokenizer.
 * This pre-tokenizer replaces any whitespace by the provided replacement character.
 * It then tries to split on these spaces.
 *
 * @param {string} [replacement="▁"] The replacement character. Must be exactly one character.
 * By default we use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
 * @param {boolean} [addPrefixSpace] Whether to add a space to the first word if there isn't already one.
 * This lets us treat `hello` exactly like `say hello`.
 */
export function metaspacePreTokenizer(replacement?: string, addPrefixSpace?: boolean): PreTokenizer;
bindings/node/lib/bindings/pre-tokenizers.js (new file, 9 lines)
@@ -0,0 +1,9 @@
var addon = require('../../native');

module.exports = {
  byteLevelPreTokenizer: addon.pre_tokenizers_ByteLevel,
  byteLevelAlphabet: addon.pre_tokenizers_ByteLevel_Alphabet,
  whitespacePreTokenizer: addon.pre_tokenizers_Whitespace,
  bertPreTokenizer: addon.pre_tokenizers_BertPreTokenizer,
  metaspacePreTokenizer: addon.pre_tokenizers_Metaspace
};
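A small, non-authoritative sketch of the pre-tokenizer factories above; the import path is assumed:

import { byteLevelPreTokenizer, byteLevelAlphabet, whitespacePreTokenizer } from "../bindings/pre-tokenizers";

const byByte = byteLevelPreTokenizer(true); // add a prefix space so `hello` behaves like `say hello`
const byWord = whitespacePreTokenizer();    // splits using the \w+|[^\w\s]+ regex

// 256 visible characters, one per possible byte; handy as a trainer's initialAlphabet
const alphabet: string[] = byteLevelAlphabet();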
bindings/node/lib/bindings/tokenizer.js (new file, 3 lines)
@@ -0,0 +1,3 @@
var addon = require('../../native');

module.exports.Tokenizer = addon.tokenizer_Tokenizer;
bindings/node/lib/bindings/trainers.d.ts (vendored, 100 lines changed)
@@ -4,59 +4,57 @@
 */
declare class Trainer {}

export namespace trainers {
interface TrainerOptions {
/**
 * A prefix to be used for every subword that is not a beginning-of-word.
 */
continuingSubwordPrefix?: string;
/**
 * A suffix to be used for every subword that is an end-of-word.
 */
endOfWordSuffix?: string;
/**
 * A list of characters to include in the initial alphabet, even
 * if not seen in the training dataset.
 * If a string contains more than one character, only the first one
 * is kept.
 * @default []
 */
initialAlphabet?: string[];
/**
 * The maximum different characters to keep in the alphabet.
 */
limitAlphabet?: number;
/**
 * The minimum frequency a pair should have in order to be merged.
 * @default 2
 */
minFrequency?: number;
/**
 * Whether to show progress bars while training.
 * @default true
 */
showProgress?: boolean;
/**
 * A list of special tokens the model should know of.
 * @default []
 */
specialTokens?: string[];
/**
 * The size of the final vocabulary, including all tokens and alphabet.
 * @default 30000
 */
vocabSize?: number;
}

interface TrainerOptions {
/**
 * Instantiate a new BPE Trainer
 * @param {TrainerOptions} [options] BPE Trainer options
 * A prefix to be used for every subword that is not a beginning-of-word.
 */
export function bpeTrainer(options?: TrainerOptions): Trainer;

continuingSubwordPrefix?: string;
/**
 * Instantiate a new WordPiece Trainer
 * @param {TrainerOptions} [options] WordPiece Trainer options
 * A suffix to be used for every subword that is an end-of-word.
 */
export function wordPieceTrainer(options?: TrainerOptions): Trainer;
endOfWordSuffix?: string;
/**
 * A list of characters to include in the initial alphabet, even
 * if not seen in the training dataset.
 * If a string contains more than one character, only the first one
 * is kept.
 * @default []
 */
initialAlphabet?: string[];
/**
 * The maximum different characters to keep in the alphabet.
 */
limitAlphabet?: number;
/**
 * The minimum frequency a pair should have in order to be merged.
 * @default 2
 */
minFrequency?: number;
/**
 * Whether to show progress bars while training.
 * @default true
 */
showProgress?: boolean;
/**
 * A list of special tokens the model should know of.
 * @default []
 */
specialTokens?: string[];
/**
 * The size of the final vocabulary, including all tokens and alphabet.
 * @default 30000
 */
vocabSize?: number;
}

/**
 * Instantiate a new BPE Trainer
 * @param {TrainerOptions} [options] BPE Trainer options
 */
export function bpeTrainer(options?: TrainerOptions): Trainer;

/**
 * Instantiate a new WordPiece Trainer
 * @param {TrainerOptions} [options] WordPiece Trainer options
 */
export function wordPieceTrainer(options?: TrainerOptions): Trainer;
bindings/node/lib/bindings/trainers.js (new file, 6 lines)
@@ -0,0 +1,6 @@
var addon = require('../../native');

module.exports = {
  bpeTrainer: addon.trainers_BPETrainer,
  wordPieceTrainer: addon.trainers_WordPieceTrainer
};
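For illustration, a trainer built from the TrainerOptions documented above; every field is optional and the values shown are just the documented defaults plus Bert-style special tokens (import path assumed):

import { wordPieceTrainer } from "../bindings/trainers";

const trainer = wordPieceTrainer({
  vocabSize: 30000,
  minFrequency: 2,
  specialTokens: ["[UNK]", "[SEP]", "[CLS]"],
  continuingSubwordPrefix: "##"
});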
@@ -2,7 +2,7 @@ import { promisify } from "util";
import { Encoding, Tokenizer } from "../bindings/tokenizer";

export class BaseTokenizer {
  constructor(private tokenizer: Tokenizer) {}
  constructor(protected tokenizer: Tokenizer) {}

  /**
   * Encode the given sequence
@@ -11,7 +11,7 @@ export class BaseTokenizer {
   * @param {(string | null)} pair The optional pair sequence
   */
  async encode(sequence: string, pair?: string): Promise<Encoding> {
    const encode = promisify(this.tokenizer.encode);
    const encode = promisify(this.tokenizer.encode.bind(this.tokenizer));
    return encode(sequence, pair ?? null);
  }

@@ -22,7 +22,7 @@ export class BaseTokenizer {
   * The list can contain both at the same time.
   */
  async encodeBatch(sequences: (string | [string, string])[]): Promise<Encoding[]> {
    const encodeBatch = promisify(this.tokenizer.encodeBatch);
    const encodeBatch = promisify(this.tokenizer.encodeBatch.bind(this.tokenizer));
    return encodeBatch(sequences);
  }
}
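The `.bind(this.tokenizer)` change matters because `promisify` returns a plain function: when the method is detached from its object, `this` is no longer the tokenizer instance at call time. A minimal sketch of the pitfall, using a generic class rather than the native addon:

import { promisify } from "util";

class Greeter {
  constructor(private name: string) {}
  greet(cb: (err: any, msg: string) => void): void {
    // relies on `this` still pointing at the Greeter instance
    cb(null, `hello from ${this.name}`);
  }
}

const g = new Greeter("tokenizer");
const broken = promisify(g.greet);          // detached method: calling broken() loses `this`
const working = promisify(g.greet.bind(g)); // bound method: `this` is preserved
working().then(console.log);                // "hello from tokenizer"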
@@ -1,9 +1,14 @@
import { promisify } from "util";
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, models } from "../bindings/models";
import { Model, wordPiece } from "../bindings/models";
import { bertNormalizer } from "../bindings/normalizers";
import { bertPreTokenizer } from "../bindings/pre-tokenizers";
import { bertProcessing } from "../bindings/post-processors";
import { wordPieceDecoder } from "../bindings/decoders";
import { wordPieceTrainer } from "../bindings/trainers";

interface BertWordpieceOptions {
export interface BertWordPieceOptions {
  /**
   * @default true
   */
@@ -43,42 +48,121 @@ interface BertWordpieceOptions {
  wordpiecesPrefix?: string;
}

const defaultBertOptions: Required<Omit<BertWordpieceOptions, 'vocabFile'>> & { vocabFile?: string } = {
  addSpecialTokens: true,
  cleanText: true,
  clsToken: '[CLS]',
  handleChineseChars: true,
  lowercase: true,
  sepToken: '[SEP]',
  stripAccents: true,
  unkToken: '[UNK]',
  wordpiecesPrefix: '##'
};

/**
 * Instantiate and return a new Bert WordPiece tokenizer
 * @param options
 */
export async function getBertWordpieceTokenizer(options?: BertWordpieceOptions): Promise<BertWordpieceTokenizer> {
  const mergedOptions = { ...defaultBertOptions, ...options };

  let model: Model;
  if (mergedOptions.vocabFile) {
    const fromFiles = promisify(models.WordPiece.fromFiles);
    model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
  } else {
    model = models.WordPiece.empty();
  }

  const tokenizer = new Tokenizer(model);
  return new BertWordpieceTokenizer(tokenizer);
export interface BertWordPieceTrainOptions {
  /**
   * @default []
   */
  initialAlphabet?: string[];
  /**
   * @default 1000
   */
  limitAlphabet?: number;
  /**
   * @default 2
   */
  minFrequency?: number;
  /**
   * @default true
   */
  showProgress?: boolean;
  /**
   * @default ["[UNK]", "[SEP]", "[CLS]"]
   */
  specialTokens?: string[];
  /**
   * @default 30000
   */
  vocabSize?: number;
  /**
   * @default "##"
   */
  wordpiecesPrefix?: string;
}

/**
 * Bert WordPiece Tokenizer
 */
class BertWordpieceTokenizer extends BaseTokenizer {
  constructor(tokenizer: Tokenizer) {
export class BertWordPieceTokenizer extends BaseTokenizer {
  private static readonly defaultBertOptions:
    Required<Omit<BertWordPieceOptions, "vocabFile">> & { vocabFile?: string } = {
    addSpecialTokens: true,
    cleanText: true,
    clsToken: "[CLS]",
    handleChineseChars: true,
    lowercase: true,
    sepToken: "[SEP]",
    stripAccents: true,
    unkToken: "[UNK]",
    wordpiecesPrefix: "##"
  };

  private readonly defaultTrainOptions: Required<BertWordPieceTrainOptions> = {
    initialAlphabet: [],
    limitAlphabet: 1000,
    minFrequency: 2,
    showProgress: true,
    specialTokens: ['<unk>'],
    vocabSize: 30000,
    wordpiecesPrefix: "##"
  };

  private constructor(tokenizer: Tokenizer) {
    super(tokenizer);
  }

  /**
   * Instantiate and return a new Bert WordPiece tokenizer
   * @param [options] Optional tokenizer options
   */
  static async fromOptions(options?: BertWordPieceOptions): Promise<BertWordPieceTokenizer> {
    const mergedOptions = { ...this.defaultBertOptions, ...options };

    let model: Model;
    if (mergedOptions.vocabFile) {
      // const fromFiles = promisify(WordPiece.fromFiles);
      model = wordPiece.fromFiles(mergedOptions.vocabFile, { unkToken: mergedOptions.unkToken });
      // model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
    } else {
      model = wordPiece.empty();
    }

    const tokenizer = new Tokenizer(model);

    const normalizer = bertNormalizer(mergedOptions);
    tokenizer.setNormalizer(normalizer);
    tokenizer.setPreTokenizer(bertPreTokenizer());

    const sepTokenId = tokenizer.tokenToId(mergedOptions.sepToken);
    if (sepTokenId === undefined) {
      throw new Error("sepToken not found in the vocabulary");
    }

    const clsTokenId = tokenizer.tokenToId(mergedOptions.clsToken);
    if (clsTokenId === undefined) {
      throw new Error("clsToken not found in the vocabulary");
    }

    if (mergedOptions.addSpecialTokens) {
      const processor = bertProcessing([mergedOptions.sepToken, sepTokenId], [mergedOptions.clsToken, clsTokenId]);
      tokenizer.setPostProcessor(processor);
    }

    const decoder = wordPieceDecoder(mergedOptions.wordpiecesPrefix);
    tokenizer.setDecoder(decoder);

    return new BertWordPieceTokenizer(tokenizer);
  }

  /**
   * Train the model using the given files
   *
   * @param files Files to use for training
   * @param [options] Training options
   */
  async train(files: string[], options?: BertWordPieceTrainOptions): Promise<void> {
    const mergedOptions = { ...this.defaultTrainOptions, ...options };
    const trainer = wordPieceTrainer(mergedOptions);

    this.tokenizer.train(trainer, files);
  }
}
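A sketch of how the new static factory above might be consumed; the vocab path is a placeholder and the import path assumes a sibling module, as in the implementations index:

import { BertWordPieceTokenizer } from "./bert-wordpiece.tokenizer";

async function run(): Promise<void> {
  // With no vocabFile the tokenizer would start from an empty WordPiece model instead
  const tokenizer = await BertWordPieceTokenizer.fromOptions({ vocabFile: "vocab.txt", lowercase: true });
  const encoding = await tokenizer.encode("Hello, y'all!", "How are you?");
  console.log(encoding);
}
run();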
@@ -1,52 +1,126 @@
import { promisify } from "util";
import { BaseTokenizer } from "./base.tokenizer";
import { Model, models } from "../bindings/models";
import { Model, bpe } from "../bindings/models";
import { Tokenizer } from "../bindings/tokenizer";
import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
import { whitespacePreTokenizer } from "../bindings/pre-tokenizers";
import { bpeDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";

interface BPEOptions {
export interface BPETokenizerOptions {
  dropout?: number;
  mergesFile?: string;
  /**
   * @default "</w>"
   */
  suffix?: string;
  /**
   * @default "<unk>"
   */
  unkToken?: string;
  vocabFile?: string;
}

const defaultBPEOptions: BPEOptions & Required<Pick<BPEOptions, 'unkToken' | 'suffix'>> = {
  suffix: '</w>',
  unkToken: '<unk>'
};

/**
 * Instantiate and return a new BPE tokenizer
 * @param options
 */
export async function getBPETokenizer(options?: BPEOptions): Promise<BPETokenizer> {
  const mergedOptions = { ...defaultBPEOptions, ...options };

  let model: Model;
  if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
    const fromFiles = promisify(models.BPE.fromFiles);
    const modelOptions: models.BPE.BPEOptions = {
      dropout: mergedOptions.dropout,
      endOfWordSuffix: mergedOptions.suffix,
      unkToken: mergedOptions.unkToken
    };

    model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
  } else {
    model = models.BPE.empty();
  }

  const tokenizer = new Tokenizer(model);
  return new BPETokenizer(tokenizer);
export interface BPETokenizerTrainOptions {
  /**
   * @default []
   */
  initialAlphabet?: string[];
  /**
   * @default 1000
   */
  limitAlphabet?: number;
  /**
   * @default 2
   */
  minFrequency?: number;
  /**
   * @default true
   */
  showProgress?: boolean;
  /**
   * @default ["<unk>"]
   */
  specialTokens?: string[];
  /**
   * @default "</w>"
   */
  suffix?: string;
  /**
   * @default 30000
   */
  vocabSize?: number;
}

/**
 * Original BPE Tokenizer.
 * Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
 */
class BPETokenizer extends BaseTokenizer {
  constructor(tokenizer: Tokenizer) {
export class BPETokenizer extends BaseTokenizer {
  private static readonly defaultBPEOptions:
    BPETokenizerOptions & Required<Pick<BPETokenizerOptions, "unkToken" | "suffix">> = {
    suffix: "</w>",
    unkToken: "<unk>"
  };

  private readonly defaultTrainOptions: Required<BPETokenizerTrainOptions> = {
    initialAlphabet: [],
    limitAlphabet: 1000,
    minFrequency: 2,
    showProgress: true,
    specialTokens: ["<unk>"],
    suffix: "</w>",
    vocabSize: 30000
  };

  private constructor(tokenizer: Tokenizer) {
    super(tokenizer);
  }

  /**
   * Instantiate and return a new BPE tokenizer
   * @param [options] Optional tokenizer options
   */
  static async fromOptions(options?: BPETokenizerOptions): Promise<BPETokenizer> {
    const mergedOptions = { ...this.defaultBPEOptions, ...options };

    let model: Model;
    if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
      // const fromFiles = promisify(BPE.fromFiles);
      const modelOptions: bpe.BPEModelOptions = {
        dropout: mergedOptions.dropout,
        endOfWordSuffix: mergedOptions.suffix,
        unkToken: mergedOptions.unkToken
      };

      model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
      // model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
    } else {
      model = bpe.empty();
    }

    const tokenizer = new Tokenizer(model);

    const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);
    tokenizer.setNormalizer(normalizer);
    tokenizer.setPreTokenizer(whitespacePreTokenizer());

    const decoder = bpeDecoder(mergedOptions.suffix);
    tokenizer.setDecoder(decoder);

    return new BPETokenizer(tokenizer);
  }

  /**
   * Train the model using the given files
   *
   * @param files Files to use for training
   * @param [options] Training options
   */
  async train(files: string[], options?: BPETokenizerTrainOptions): Promise<void> {
    const mergedOptions = { ...this.defaultTrainOptions, ...options };
    const trainer = bpeTrainer(mergedOptions);

    this.tokenizer.train(trainer, files);
  }
}
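A minimal sketch of the BPETokenizer class above, assuming a local corpus file name; without vocab/merges files the factory falls back to bpe.empty():

import { BPETokenizer } from "./bpe.tokenizer";

async function run(): Promise<void> {
  const tokenizer = await BPETokenizer.fromOptions();           // starts from an empty BPE model
  await tokenizer.train(["corpus.txt"], { vocabSize: 5000 });   // corpus.txt is a placeholder path
}
run();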
@@ -0,0 +1,91 @@
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, bpe } from "../bindings/models";
import { nfkcNormalizer } from "../bindings/normalizers";
import { byteLevelPreTokenizer, byteLevelAlphabet } from "../bindings/pre-tokenizers";
import { byteLevelDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";

export interface ByteLevelBPETokenizerOptions {
  /**
   * @default false
   */
  addPrefixSpace?: boolean;
  mergesFile?: string;
  vocabFile?: string;
}

export interface ByteLevelBPETrainOptions {
  /**
   * @default 2
   */
  minFrequency?: number;
  /**
   * @default true
   */
  showProgress?: boolean;
  /**
   * @default []
   */
  specialTokens?: string[];
  /**
   * @default 30000
   */
  vocabSize?: number;
}

/**
 * Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
 */
export class ByteLevelBPETokenizer extends BaseTokenizer {
  private static readonly defaultOptions:
    ByteLevelBPETokenizerOptions & Required<Pick<ByteLevelBPETokenizerOptions, 'addPrefixSpace'>> = {
    addPrefixSpace: false
  };

  private readonly defaultTrainOptions: Required<ByteLevelBPETrainOptions> = {
    minFrequency: 2,
    showProgress: true,
    specialTokens: ['<unk>'],
    vocabSize: 30000
  };

  private constructor(tokenizer: Tokenizer) {
    super(tokenizer);
  }

  static async fromOptions(options?: ByteLevelBPETokenizerOptions): Promise<ByteLevelBPETokenizer> {
    const mergedOptions = { ...this.defaultOptions, ...options };

    let model: Model;
    if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
      // const fromFiles = promisify(BPE.fromFiles);
      model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile);
      // model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, null);
    } else {
      model = bpe.empty();
    }

    const tokenizer = new Tokenizer(model);
    tokenizer.setNormalizer(nfkcNormalizer());

    const preTokenizer = byteLevelPreTokenizer(mergedOptions.addPrefixSpace);
    tokenizer.setPreTokenizer(preTokenizer);
    tokenizer.setDecoder(byteLevelDecoder());

    return new ByteLevelBPETokenizer(tokenizer);
  }

  /**
   * Train the model using the given files
   *
   * @param files Files to use for training
   * @param [options] Training options
   */
  async train(files: string[], options?: ByteLevelBPETrainOptions): Promise<void> {
    const mergedOptions = { ...this.defaultTrainOptions, ...options };
    const trainer = bpeTrainer({ ...mergedOptions, initialAlphabet: byteLevelAlphabet() });

    this.tokenizer.train(trainer, files);
  }
}
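For completeness, a hedged usage sketch of the byte-level class above; the corpus file name is a placeholder:

import { ByteLevelBPETokenizer } from "./byte-level-bpe.tokenizer";

async function run(): Promise<void> {
  const tokenizer = await ByteLevelBPETokenizer.fromOptions({ addPrefixSpace: true });
  // train() seeds the trainer's initialAlphabet with the 256-character byte-level alphabet
  await tokenizer.train(["corpus.txt"], { vocabSize: 30000 });
}
run();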
@@ -1,2 +1,4 @@
export * from './bert-wordpiece.tokenizer';
export * from './bpe.tokenizer';
export * from './byte-level-bpe.tokenizer';
export * from './sentence-piece.tokenizer';
bindings/node/lib/implementations/sentence-piece.tokenizer.ts (new file, 121 lines)
@@ -0,0 +1,121 @@
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, bpe } from "../bindings/models";
import { nfkcNormalizer } from "../bindings/normalizers";
import { metaspacePreTokenizer } from "../bindings/pre-tokenizers";
import { metaspaceDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";

export interface SentencePieceTokenizerOptions extends OptionsWithDefaults {
  dropout?: number;
  mergesFile?: string;
  vocabFile?: string;
}

interface OptionsWithDefaults {
  /**
   * @default true
   */
  addPrefixSpace?: boolean;
  /**
   * @default "▁"
   */
  replacement?: string;
  /**
   * @default "<unk>"
   */
  unkToken?: string;
}

export interface SentencePieceTrainOptions {
  /**
   * @default []
   */
  initialAlphabet?: string[];
  /**
   * @default 1000
   */
  limitAlphabet?: number;
  /**
   * @default 2
   */
  minFrequency?: number;
  /**
   * @default true
   */
  showProgress?: boolean;
  /**
   * @default ["<unk>"]
   */
  specialTokens?: string[];
  /**
   * @default 30000
   */
  vocabSize?: number;
}

/**
 * Represents the BPE algorithm, with the pretokenization used by SentencePiece
 */
export class SentencePieceTokenizer extends BaseTokenizer {
  private static readonly defaultOptions: SentencePieceTokenizerOptions & Required<OptionsWithDefaults> = {
    addPrefixSpace: true,
    replacement: '▁',
    unkToken: '<unk>'
  };

  private readonly defaultTrainOptions: Required<SentencePieceTrainOptions> = {
    initialAlphabet: [],
    limitAlphabet: 1000,
    minFrequency: 2,
    showProgress: true,
    specialTokens: ['<unk>'],
    vocabSize: 30000
  };

  private constructor(tokenizer: Tokenizer) {
    super(tokenizer);
  }

  static async fromOptions(options?: SentencePieceTokenizerOptions): Promise<SentencePieceTokenizer> {
    const mergedOptions = { ...this.defaultOptions, ...options };

    let model: Model;
    if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
      // const fromFiles = promisify(BPE.fromFiles);
      const modelOptions: bpe.BPEModelOptions = {
        dropout: mergedOptions.dropout,
        unkToken: mergedOptions.unkToken
      };

      model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
      // model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, null);
    } else {
      model = bpe.empty();
    }

    const tokenizer = new Tokenizer(model);
    tokenizer.setNormalizer(nfkcNormalizer());

    const preTokenizer = metaspacePreTokenizer(mergedOptions.replacement, mergedOptions.addPrefixSpace);
    tokenizer.setPreTokenizer(preTokenizer);

    const decoder = metaspaceDecoder(mergedOptions.replacement, mergedOptions.addPrefixSpace);
    tokenizer.setDecoder(decoder);

    return new SentencePieceTokenizer(tokenizer);
  }

  /**
   * Train the model using the given files
   *
   * @param files Files to use for training
   * @param [options] Training options
   */
  async train(files: string[], options?: SentencePieceTrainOptions): Promise<void> {
    const mergedOptions = { ...this.defaultTrainOptions, ...options };
    const trainer = bpeTrainer(mergedOptions);

    this.tokenizer.train(trainer, files);
  }
}
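A usage sketch of the SentencePiece-style tokenizer above; the vocab/merges paths are placeholders and not part of this commit:

import { SentencePieceTokenizer } from "./sentence-piece.tokenizer";

async function run(): Promise<void> {
  // replacement and addPrefixSpace fall back to "▁" and true when omitted
  const tokenizer = await SentencePieceTokenizer.fromOptions({
    vocabFile: "vocab.json",
    mergesFile: "merges.txt"
  });
  const encoding = await tokenizer.encode("Hello world");
  console.log(encoding);
}
run();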
bindings/node/lib/index.d.ts (vendored, 1 line changed)
@@ -1,2 +1 @@
export * from './bindings';
export * from './implementations';
@@ -3,5 +3,5 @@ function __export(m) {
  for (var p in m) if (!exports.hasOwnProperty(p)) exports[p] = m[p];
}
Object.defineProperty(exports, "__esModule", { value: true });
__export(require("./bindings"));
// export * from './bindings';
__export(require("./implementations"));
@@ -1,2 +1,2 @@
export * from './bindings';
// export * from './bindings';
export * from './implementations';