first implementations draft

Pierric Cistac
2020-01-09 17:39:09 -05:00
parent 63532ef583
commit 6b0935d5de
12 changed files with 1253 additions and 6 deletions


@@ -9,7 +9,7 @@ declare class Model {
export namespace models {
export namespace BPE {
interface BPEOptions {
export interface BPEOptions {
/**
* The number of words that the BPE cache can contain. The cache allows
* speeding up the process by keeping the result of the merge operations
@@ -40,12 +40,40 @@ export namespace models {
* @param {string} vocab Path to a vocabulary JSON file
* @param {string} merges Path to a merge file
* @param {BPEOptions} [options] BPE model options
* @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
*/
export function fromFiles(vocab: string, merges: string, options?: BPEOptions): Model;
export function fromFiles(
vocab: string,
merges: string,
options: BPEOptions | null,
__callback: (err: any, model: Model) => void
): void;
/**
* Instantiate an empty BPE Model
*/
export function empty(): Model;
}
export namespace WordPiece {
/**
* Instantiate a WordPiece model from the given vocab file
*
* @param {string} vocab Path to a vocabulary file
* @param {string} [unkToken] The unknown token to be used by the model
* @param {number} [maxInputCharsPerWord] The maximum number of characters to allow in a single word
* @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
*/
export function fromFiles(
vocab: string,
unkToken: string,
maxInputCharsPerWord: number | null,
__callback: (err: any, model: Model) => void
): void;
/**
* Instantiate an empty WordPiece model
*/
export function empty(): Model;
}
}
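For reference, a minimal sketch of how the new callback-style bindings could be consumed by promisifying them, as the implementations below do. The file paths are placeholders, and the option names are the ones this commit uses elsewhere:

import { promisify } from "util";
import { Model, models } from "../bindings/models";

// The bindings now take a Node-style callback as their last argument,
// so util.promisify turns them into Promise-returning functions.
const bpeFromFiles = promisify(models.BPE.fromFiles);

async function loadModel(): Promise<Model> {
  // "vocab.json" and "merges.txt" are placeholder paths to trained BPE files.
  return bpeFromFiles("vocab.json", "merges.txt", {
    endOfWordSuffix: "</w>",
    unkToken: "<unk>"
  });
}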


@@ -20,11 +20,11 @@ export class Tokenizer {
constructor(model: Model);
/**
* Encode the given sequence.
* Encode the given sequence
*
* @param {string} sequence The sequence to encode
* @param {(string | null)} pair The optional pair sequence
* @param {(err: any, encoding: Encoding) => void} __callback Callback to call when encoding is complete
* @param {(err: any, encoding: Encoding) => void} __callback Callback called when encoding is complete
*/
encode(sequence: string, pair: string | null, __callback: (err: any, encoding: Encoding) => void): void;
@@ -33,7 +33,7 @@ export class Tokenizer {
*
* @param {((string | [string, string])[])} sequences A list of sequences or pair of sequences.
* The list can contain both at the same time.
* @param {(err: any, encodings: Encoding[]) => void} __callback Callback to call when encoding is complete
* @param {(err: any, encodings: Encoding[]) => void} __callback Callback called when encoding is complete
*/
encodeBatch(sequences: (string | [string, string])[], __callback: (err: any, encodings: Encoding[]) => void): void;
@@ -54,7 +54,7 @@ export class Tokenizer {
* @param model New model to use
* @throws Will throw an error if any task is running
*/
withModel(model: Model): void;
setModel(model: Model): void;
}
/**
@@ -76,6 +76,11 @@ declare class Encoding {
*/
getOffsets(): [number, number][];
/**
* Returns the overflowing encoding, after truncation
*/
getOverflowing(): Encoding | undefined;
/**
* Returns the special tokens mask
*/
@@ -90,4 +95,43 @@ declare class Encoding {
* Returns the type ids
*/
getTypeIds(): number[];
/**
* Pad the current Encoding to the given length
*
* @param {number} length The length at which to pad
* @param {PaddingOptions} [options] Padding options
*/
pad(length: number, options?: PaddingOptions): void;
/**
* Truncate the current Encoding to the given maximum length
*
* @param {number} length The maximum length to be kept
* @param {number} [stride=0] The length of the previous first sequence
* to be included in the overflowing sequence
*/
truncate(length: number, stride?: number): void;
}
interface PaddingOptions {
/**
* @default "right"
*/
direction?: 'left' | 'right';
/**
* The index to be used when padding
* @default 0
*/
padId?: number;
/**
* The type index to be used when padding
* @default 0
*/
padTypeId?: number;
/**
* The pad token to be used when padding
* @default "[PAD]"
*/
padToken?: string;
}
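A short sketch of the new pad/truncate API; `encoding` is assumed to come from a previous Tokenizer.encode call, and since both methods return void they presumably mutate the Encoding in place:

import { Encoding } from "../bindings/tokenizer";

declare const encoding: Encoding; // assumed: obtained from Tokenizer.encode

encoding.truncate(128, 16); // keep 128 entries; carry a stride of 16 into the overflow
encoding.pad(128, { direction: "right", padId: 0, padToken: "[PAD]" });
const overflowing = encoding.getOverflowing(); // Encoding | undefined after truncation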


@@ -0,0 +1,2 @@
*.d.ts
*.js


@@ -0,0 +1,28 @@
import { promisify } from "util";
import { Encoding, Tokenizer } from "../bindings/tokenizer";
export class BaseTokenizer {
constructor(private tokenizer: Tokenizer) {}
/**
* Encode the given sequence
*
* @param {string} sequence The sequence to encode
* @param {(string | null)} pair The optional pair sequence
*/
async encode(sequence: string, pair?: string): Promise<Encoding> {
// Bind the native method so it keeps its receiver once promisified
const encode = promisify(this.tokenizer.encode.bind(this.tokenizer));
return encode(sequence, pair ?? null);
}
/**
* Encode the given sequences or pair of sequences
*
* @param {((string | [string, string])[])} sequences A list of sequences or pair of sequences.
* The list can contain both at the same time.
*/
async encodeBatch(sequences: (string | [string, string])[]): Promise<Encoding[]> {
const encodeBatch = promisify(this.tokenizer.encodeBatch.bind(this.tokenizer));
return encodeBatch(sequences);
}
}
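A sketch of how these helpers are meant to be reused; the subclass name is hypothetical, and the concrete tokenizers below follow the same pattern:

import { models } from "../bindings/models";
import { Tokenizer } from "../bindings/tokenizer";
import { BaseTokenizer } from "./base.tokenizer";

// Hypothetical concrete tokenizer: any subclass inherits the
// promise-based encode/encodeBatch wrappers.
class DraftTokenizer extends BaseTokenizer {}

async function demo(): Promise<void> {
  const tokenizer = new DraftTokenizer(new Tokenizer(models.BPE.empty()));
  const encoding = await tokenizer.encode("Hello world", "optional pair");
  console.log(encoding.getOffsets()); // [start, end] offsets into the input
}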


@@ -0,0 +1,84 @@
import { promisify } from "util";
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, models } from "../bindings/models";
interface BertWordpieceOptions {
/**
* @default true
*/
addSpecialTokens?: boolean;
/**
* @default true
*/
cleanText?: boolean;
/**
* @default "[CLS]"
*/
clsToken?: string;
/**
* @default true
*/
handleChineseChars?: boolean;
/**
* @default true
*/
lowercase?: boolean;
/**
* @default "[SEP]"
*/
sepToken?: string;
/**
* @default true
*/
stripAccents?: boolean;
/**
* @default "[UNK]"
*/
unkToken?: string;
vocabFile?: string;
/**
* @default "##"
*/
wordpiecesPrefix?: string;
}
const defaultBertOptions: Required<Omit<BertWordpieceOptions, 'vocabFile'>> & { vocabFile?: string } = {
addSpecialTokens: true,
cleanText: true,
clsToken: '[CLS]',
handleChineseChars: true,
lowercase: true,
sepToken: '[SEP]',
stripAccents: true,
unkToken: '[UNK]',
wordpiecesPrefix: '##'
};
/**
* Instantiate and return a new Bert WordPiece tokenizer
* @param options
*/
export async function getBertWordpieceTokenizer(options?: BertWordpieceOptions): Promise<BertWordpieceTokenizer> {
const mergedOptions = { ...defaultBertOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile) {
const fromFiles = promisify(models.WordPiece.fromFiles);
model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
} else {
model = models.WordPiece.empty();
}
const tokenizer = new Tokenizer(model);
return new BertWordpieceTokenizer(tokenizer);
}
/**
* Bert WordPiece Tokenizer
*/
class BertWordpieceTokenizer extends BaseTokenizer {
constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
}
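Usage would look something like this sketch, where "vocab.txt" is a placeholder path to a BERT vocabulary file:

import { getBertWordpieceTokenizer } from "./bert-wordpiece.tokenizer";

async function main(): Promise<void> {
  const tokenizer = await getBertWordpieceTokenizer({ vocabFile: "vocab.txt" });
  const encoding = await tokenizer.encode("Hello, y'all!", "How are you?");
  console.log(encoding.getTypeIds()); // type ids distinguishing the two sequences
}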


@@ -0,0 +1,52 @@
import { promisify } from "util";
import { BaseTokenizer } from "./base.tokenizer";
import { Model, models } from "../bindings/models";
import { Tokenizer } from "../bindings/tokenizer";
interface BPEOptions {
dropout?: number;
mergesFile?: string;
suffix?: string;
unkToken?: string;
vocabFile?: string;
}
const defaultBPEOptions: BPEOptions & Required<Pick<BPEOptions, 'unkToken' | 'suffix'>> = {
suffix: '</w>',
unkToken: '<unk>'
};
/**
* Instantiate and return a new BPE tokenizer
* @param options
*/
export async function getBPETokenizer(options?: BPEOptions): Promise<BPETokenizer> {
const mergedOptions = { ...defaultBPEOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
const fromFiles = promisify(models.BPE.fromFiles);
const modelOptions: models.BPE.BPEOptions = {
dropout: mergedOptions.dropout,
endOfWordSuffix: mergedOptions.suffix,
unkToken: mergedOptions.unkToken
};
model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
} else {
model = models.BPE.empty();
}
const tokenizer = new Tokenizer(model);
return new BPETokenizer(tokenizer);
}
/**
* Original BPE Tokenizer.
* Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
*/
class BPETokenizer extends BaseTokenizer {
constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
}
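And a matching sketch for the BPE tokenizer, this time through the batched API; the file paths are again placeholders:

import { getBPETokenizer } from "./bpe.tokenizer";

async function main(): Promise<void> {
  const tokenizer = await getBPETokenizer({
    vocabFile: "vocab.json",
    mergesFile: "merges.txt"
  });
  // The batch can mix single sequences and [sequence, pair] tuples.
  const encodings = await tokenizer.encodeBatch(["Hello!", ["Question?", "Answer."]]);
  console.log(encodings.length); // one Encoding per input entry
}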


@@ -0,0 +1,2 @@
export * from './bert-wordpiece.tokenizer';
export * from './bpe.tokenizer';


@@ -1 +1,2 @@
export * from './bindings';
export * from './implementations';


@@ -4,3 +4,4 @@ function __export(m) {
}
Object.defineProperty(exports, "__esModule", { value: true });
__export(require("./bindings"));
__export(require("./implementations"));


@@ -1 +1,2 @@
export * from './bindings';
export * from './implementations';

bindings/node/package-lock.json (generated, 1003 lines): diff suppressed because it is too large.


@@ -11,6 +11,7 @@
"author": "Anthony MOI <m.anthony.moi@gmail.com>",
"license": "Apache-2.0",
"dependencies": {
"@types/node": "^13.1.6",
"neon-cli": "^0.3.3",
"node-pre-gyp": "github:amilajack/node-pre-gyp#neon-compat"
},