mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
first implementations draft
32 bindings/node/lib/bindings/models.d.ts (vendored)

@@ -9,7 +9,7 @@ declare class Model {

 export namespace models {
   export namespace BPE {
-    interface BPEOptions {
+    export interface BPEOptions {
      /**
       * The number of words that the BPE cache can contain. The cache allows
       * to speed-up the process by keeping the result of the merge operations
@@ -40,12 +40,40 @@ export namespace models {
      * @param {string} vocab Path to a vocabulary JSON file
      * @param {string} merges Path to a merge file
      * @param {BPEOptions} [options] BPE model options
+     * @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
      */
-    export function fromFiles(vocab: string, merges: string, options?: BPEOptions): Model;
+    export function fromFiles(
+      vocab: string,
+      merges: string,
+      options: BPEOptions | null,
+      __callback: (err: any, model: Model) => void
+    ): void;

     /**
      * Instantiate an empty BPE Model
      */
     export function empty(): Model;
   }
+
+  export namespace WordPiece {
+    /**
+     * Instantiate a WordPiece model from the given vocab file
+     *
+     * @param {string} vocab Path to a vocabulary file
+     * @param {string} [unkToken] The unknown token to be used by the model
+     * @param {number} [maxInputCharsPerWord] The maximum number of characters to authorize in a single word
+     * @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
+     */
+    export function fromFiles(
+      vocab: string,
+      unkToken: string,
+      maxInputCharsPerWord: number | null,
+      __callback: (err: any, model: Model) => void
+    ): void;
+
+    /**
+     * Instantiate an empty WordPiece model
+     */
+    export function empty(): Model;
+  }
 }
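Usage note (not part of the diff): the callback-based fromFiles pairs naturally with util.promisify, which is exactly how the implementation files below consume it. A minimal sketch, assuming the native bindings are built; the file paths are placeholders, and the import path is relative to lib/implementations:

    import { promisify } from "util";
    import { Model, models } from "../bindings/models";

    // promisify maps the trailing (err, model) callback onto a Promise<Model>
    const bpeFromFiles = promisify(models.BPE.fromFiles);

    async function loadBpeModel(): Promise<Model> {
      // "./vocab.json" and "./merges.txt" are placeholder paths
      return bpeFromFiles("./vocab.json", "./merges.txt", { unkToken: "<unk>" });
    }
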
52 bindings/node/lib/bindings/tokenizer.d.ts (vendored)
@@ -20,11 +20,11 @@ export class Tokenizer {
   constructor(model: Model);

   /**
-   * Encode the given sequence.
+   * Encode the given sequence
    *
    * @param {string} sequence The sequence to encode
    * @param {(string | null)} pair The optional pair sequence
-   * @param {(err: any, encoding: Encoding) => void} __callback Callback to call when encoding is complete
+   * @param {(err: any, encoding: Encoding) => void} __callback Callback called when encoding is complete
    */
   encode(sequence: string, pair: string | null, __callback: (err: any, encoding: Encoding) => void): void;

@@ -33,7 +33,7 @@ export class Tokenizer {
    *
    * @param {((string | [string, string])[])} sequences A list of sequences or pair of sequences.
    * The list can contain both at the same time.
-   * @param {(err: any, encodings: Encoding[]) => void} __callback Callback to call when encoding is complete
+   * @param {(err: any, encodings: Encoding[]) => void} __callback Callback called when encoding is complete
    */
   encodeBatch(sequences: (string | [string, string])[], __callback: (err: any, encodings: Encoding[]) => void): void;

@@ -54,7 +54,7 @@ export class Tokenizer {
    * @param model New model to use
    * @throws Will throw an error if any task is running
    */
-  withModel(model: Model): void;
+  setModel(model: Model): void;
 }

 /**
@@ -76,6 +76,11 @@ declare class Encoding {
    */
   getOffsets(): [number, number][];

+  /**
+   * Returns the overflowing encoding, after truncation
+   */
+  getOverflowing(): Encoding | undefined;
+
   /**
    * Returns the special tokens mask
    */
@@ -90,4 +95,43 @@ declare class Encoding {
    * Returns the type ids
    */
   getTypeIds(): number[];
+
+  /**
+   * Pad the current Encoding at the given length
+   *
+   * @param {number} length The length at which to pad
+   * @param {PaddingOptions} [options] Padding options
+   */
+  pad(length: number, options?: PaddingOptions): void;
+
+  /**
+   * Truncate the current Encoding at the given max_length
+   *
+   * @param {number} length The maximum length to be kept
+   * @param {number} [stride=0] The length of the previous first sequence
+   * to be included in the overflowing sequence
+   */
+  truncate(length: number, stride?: number): void;
 }
+
+interface PaddingOptions {
+  /**
+   * @default "right"
+   */
+  direction?: 'left' | 'right';
+  /**
+   * The index to be used when padding
+   * @default 0
+   */
+  padId?: number;
+  /**
+   * The type index to be used when padding
+   * @default 0
+   */
+  padTypeId?: number;
+  /**
+   * The pad token to be used when padding
+   * @default "[PAD]"
+   */
+  padToken?: string;
+}
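Usage note (not part of the diff): a minimal sketch of the new Encoding helpers driven through the callback-based encode. The Tokenizer is assumed to be constructed elsewhere, and 128/16 are arbitrary example lengths:

    import { Encoding, Tokenizer } from "../bindings/tokenizer";

    declare const tokenizer: Tokenizer; // assumed constructed elsewhere

    tokenizer.encode("Hello there!", null, (err: any, encoding: Encoding) => {
      if (err) throw err;

      // Keep at most 128 tokens; the last 16 tokens are carried as stride
      // into the overflowing sequence, then pad back to a fixed length
      encoding.truncate(128, 16);
      encoding.pad(128, { direction: "right", padToken: "[PAD]" });

      // Only defined after truncation, when something actually overflowed
      const overflowing = encoding.getOverflowing();
      console.log(encoding.getTypeIds(), overflowing?.getOffsets());
    });
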
2 bindings/node/lib/implementations/.gitignore (vendored, new file)

@@ -0,0 +1,2 @@
+*.d.ts
+*.js
28 bindings/node/lib/implementations/base.tokenizer.ts (new file)

@@ -0,0 +1,28 @@
+import { promisify } from "util";
+import { Encoding, Tokenizer } from "../bindings/tokenizer";
+
+export class BaseTokenizer {
+  constructor(private tokenizer: Tokenizer) {}
+
+  /**
+   * Encode the given sequence
+   *
+   * @param {string} sequence The sequence to encode
+   * @param {(string | null)} pair The optional pair sequence
+   */
+  async encode(sequence: string, pair?: string): Promise<Encoding> {
+    const encode = promisify(this.tokenizer.encode);
+    return encode(sequence, pair ?? null);
+  }
+
+  /**
+   * Encode the given sequences or pair of sequences
+   *
+   * @param {((string | [string, string])[])} sequences A list of sequences or pair of sequences.
+   * The list can contain both at the same time.
+   */
+  async encodeBatch(sequences: (string | [string, string])[]): Promise<Encoding[]> {
+    const encodeBatch = promisify(this.tokenizer.encodeBatch);
+    return encodeBatch(sequences);
+  }
+}
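Usage note (not part of the diff): a sketch of the wrapper above around an empty model. One caveat: promisify(this.tokenizer.encode) detaches the method from its receiver, so this relies on the native binding tolerating an unbound this; if it does not, the method would need this.tokenizer.encode.bind(this.tokenizer):

    import { Tokenizer } from "../bindings/tokenizer";
    import { models } from "../bindings/models";
    import { BaseTokenizer } from "./base.tokenizer";

    async function demo(): Promise<void> {
      // An empty WordPiece model is enough to exercise the async wrapper
      const base = new BaseTokenizer(new Tokenizer(models.WordPiece.empty()));

      const single = await base.encode("Hello world");
      const batch = await base.encodeBatch(["Hello world", ["question", "context"]]);
      console.log(single.getTypeIds(), batch.length);
    }
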
84 bindings/node/lib/implementations/bert-wordpiece.tokenizer.ts (new file)

@@ -0,0 +1,84 @@
+import { promisify } from "util";
+import { BaseTokenizer } from "./base.tokenizer";
+import { Tokenizer } from "../bindings/tokenizer";
+import { Model, models } from "../bindings/models";
+
+interface BertWordpieceOptions {
+  /**
+   * @default true
+   */
+  addSpecialTokens?: boolean;
+  /**
+   * @default true
+   */
+  cleanText?: boolean;
+  /**
+   * @default "[CLS]"
+   */
+  clsToken?: string;
+  /**
+   * @default true
+   */
+  handleChineseChars?: boolean;
+  /**
+   * @default true
+   */
+  lowercase?: boolean;
+  /**
+   * @default "[SEP]"
+   */
+  sepToken?: string;
+  /**
+   * @default true
+   */
+  stripAccents?: boolean;
+  /**
+   * @default "[UNK]"
+   */
+  unkToken?: string;
+  vocabFile?: string;
+  /**
+   * @default "##"
+   */
+  wordpiecesPrefix?: string;
+}
+
+const defaultBertOptions: Required<Omit<BertWordpieceOptions, 'vocabFile'>> & { vocabFile?: string } = {
+  addSpecialTokens: true,
+  cleanText: true,
+  clsToken: '[CLS]',
+  handleChineseChars: true,
+  lowercase: true,
+  sepToken: '[SEP]',
+  stripAccents: true,
+  unkToken: '[UNK]',
+  wordpiecesPrefix: '##'
+};
+
+/**
+ * Instantiate and return a new Bert WordPiece tokenizer
+ * @param options
+ */
+export async function getBertWordpieceTokenizer(options?: BertWordpieceOptions): Promise<BertWordpieceTokenizer> {
+  const mergedOptions = { ...defaultBertOptions, ...options };
+
+  let model: Model;
+  if (mergedOptions.vocabFile) {
+    const fromFiles = promisify(models.WordPiece.fromFiles);
+    model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
+  } else {
+    model = models.WordPiece.empty();
+  }
+
+  const tokenizer = new Tokenizer(model);
+  return new BertWordpieceTokenizer(tokenizer);
+}
+
+/**
+ * Bert WordPiece Tokenizer
+ */
+class BertWordpieceTokenizer extends BaseTokenizer {
+  constructor(tokenizer: Tokenizer) {
+    super(tokenizer);
+  }
+}
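Usage note (not part of the diff): the intended call-site shape for the factory above. The vocab path is a placeholder; without vocabFile the factory falls back to an empty model:

    import { getBertWordpieceTokenizer } from "./bert-wordpiece.tokenizer";

    async function demo(): Promise<void> {
      const tokenizer = await getBertWordpieceTokenizer({
        vocabFile: "./bert-base-uncased-vocab.txt" // placeholder path
      });
      // encode/encodeBatch are inherited from BaseTokenizer
      const encoding = await tokenizer.encode("Hello, y'all!", "How are you?");
      console.log(encoding.getTypeIds());
    }
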
52 bindings/node/lib/implementations/bpe.tokenizer.ts (new file)

@@ -0,0 +1,52 @@
+import { promisify } from "util";
+import { BaseTokenizer } from "./base.tokenizer";
+import { Model, models } from "../bindings/models";
+import { Tokenizer } from "../bindings/tokenizer";
+
+interface BPEOptions {
+  dropout?: number;
+  mergesFile?: string;
+  suffix?: string;
+  unkToken?: string;
+  vocabFile?: string;
+}
+
+const defaultBPEOptions: BPEOptions & Required<Pick<BPEOptions, 'unkToken' | 'suffix'>> = {
+  suffix: '</w>',
+  unkToken: '<unk>'
+};
+
+/**
+ * Instantiate and return a new BPE tokenizer
+ * @param options
+ */
+export async function getBPETokenizer(options?: BPEOptions): Promise<BPETokenizer> {
+  const mergedOptions = { ...defaultBPEOptions, ...options };
+
+  let model: Model;
+  if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
+    const fromFiles = promisify(models.BPE.fromFiles);
+    const modelOptions: models.BPE.BPEOptions = {
+      dropout: mergedOptions.dropout,
+      endOfWordSuffix: mergedOptions.suffix,
+      unkToken: mergedOptions.unkToken
+    };
+
+    model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
+  } else {
+    model = models.BPE.empty();
+  }
+
+  const tokenizer = new Tokenizer(model);
+  return new BPETokenizer(tokenizer);
+}
+
+/**
+ * Original BPE Tokenizer.
+ * Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
+ */
+class BPETokenizer extends BaseTokenizer {
+  constructor(tokenizer: Tokenizer) {
+    super(tokenizer);
+  }
+}
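Usage note (not part of the diff): the same pattern for the BPE factory. Paths are placeholders and dropout is an arbitrary example value:

    import { getBPETokenizer } from "./bpe.tokenizer";

    async function demo(): Promise<void> {
      // Both files must be set, otherwise the factory falls back to an empty model
      const tokenizer = await getBPETokenizer({
        vocabFile: "./vocab.json",  // placeholder paths
        mergesFile: "./merges.txt",
        dropout: 0.1
      });
      const encodings = await tokenizer.encodeBatch(["hello world", "lorem ipsum"]);
      console.log(encodings.map((e) => e.getOffsets()));
    }
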
2 bindings/node/lib/implementations/index.ts (new file)

@@ -0,0 +1,2 @@
+export * from './bert-wordpiece.tokenizer';
+export * from './bpe.tokenizer';
1 bindings/node/lib/index.d.ts (vendored)

@@ -1 +1,2 @@
 export * from './bindings';
+export * from './implementations';
bindings/node/lib/index.js

@@ -4,3 +4,4 @@ function __export(m) {
 }
 Object.defineProperty(exports, "__esModule", { value: true });
 __export(require("./bindings"));
+__export(require("./implementations"));
bindings/node/lib/index.ts

@@ -1 +1,2 @@
 export * from './bindings';
+export * from './implementations';
1003 bindings/node/package-lock.json (generated, new file)

File diff suppressed because it is too large
bindings/node/package.json

@@ -11,6 +11,7 @@
   "author": "Anthony MOI <m.anthony.moi@gmail.com>",
   "license": "Apache-2.0",
   "dependencies": {
+    "@types/node": "^13.1.6",
     "neon-cli": "^0.3.3",
     "node-pre-gyp": "github:amilajack/node-pre-gyp#neon-compat"
   },