mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
first implementations draft
32 bindings/node/lib/bindings/models.d.ts (vendored)

@@ -9,7 +9,7 @@ declare class Model {

 export namespace models {
   export namespace BPE {
-    interface BPEOptions {
+    export interface BPEOptions {
      /**
       * The number of words that the BPE cache can contain. The cache allows
       * to speed-up the process by keeping the result of the merge operations
@@ -40,12 +40,40 @@ export namespace models {
      * @param {string} vocab Path to a vocabulary JSON file
      * @param {string} merges Path to a merge file
      * @param {BPEOptions} [options] BPE model options
+     * @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
      */
-    export function fromFiles(vocab: string, merges: string, options?: BPEOptions): Model;
+    export function fromFiles(
+      vocab: string,
+      merges: string,
+      options: BPEOptions | null,
+      __callback: (err: any, model: Model) => void
+    ): void;

     /**
      * Instantiate an empty BPE Model
      */
     export function empty(): Model;
   }
+
+  export namespace WordPiece {
+    /**
+     * Instantiate a WordPiece model from the given vocab file
+     *
+     * @param {string} vocab Path to a vocabulary file
+     * @param {string} [unkToken] The unknown token to be used by the model
+     * @param {number} [maxInputCharsPerWord] The maximum number of characters to authorize in a single word
+     * @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
+     */
+    export function fromFiles(
+      vocab: string,
+      unkToken: string,
+      maxInputCharsPerWord: number | null,
+      __callback: (err: any, model: Model) => void
+    ): void;
+
+    /**
+     * Instantiate an empty WordPiece model
+     */
+    export function empty(): Model;
+  }
 }
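Usage note (not part of the diff): the callback-based fromFiles pairs naturally with util.promisify, which is exactly how the implementation files below consume it. A minimal sketch, assuming the native bindings are built; the file paths are placeholders, and the import path is relative to lib/implementations:

    import { promisify } from "util";
    import { Model, models } from "../bindings/models";

    // promisify maps the trailing (err, model) callback onto a Promise<Model>
    const bpeFromFiles = promisify(models.BPE.fromFiles);

    async function loadBpeModel(): Promise<Model> {
      // "./vocab.json" and "./merges.txt" are placeholder paths
      return bpeFromFiles("./vocab.json", "./merges.txt", { unkToken: "<unk>" });
    }
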
52 bindings/node/lib/bindings/tokenizer.d.ts (vendored)
@@ -20,11 +20,11 @@ export class Tokenizer {
   constructor(model: Model);

   /**
-   * Encode the given sequence.
+   * Encode the given sequence
    *
    * @param {string} sequence The sequence to encode
    * @param {(string | null)} pair The optional pair sequence
-   * @param {(err: any, encoding: Encoding) => void} __callback Callback to call when encoding is complete
+   * @param {(err: any, encoding: Encoding) => void} __callback Callback called when encoding is complete
    */
   encode(sequence: string, pair: string | null, __callback: (err: any, encoding: Encoding) => void): void;

@@ -33,7 +33,7 @@ export class Tokenizer {
    *
    * @param {((string | [string, string])[])} sequences A list of sequences or pair of sequences.
    * The list can contain both at the same time.
-   * @param {(err: any, encodings: Encoding[]) => void} __callback Callback to call when encoding is complete
+   * @param {(err: any, encodings: Encoding[]) => void} __callback Callback called when encoding is complete
    */
   encodeBatch(sequences: (string | [string, string])[], __callback: (err: any, encodings: Encoding[]) => void): void;

@@ -54,7 +54,7 @@ export class Tokenizer {
    * @param model New model to use
    * @throws Will throw an error if any task is running
    */
-  withModel(model: Model): void;
+  setModel(model: Model): void;
 }

 /**
@@ -76,6 +76,11 @@ declare class Encoding {
    */
   getOffsets(): [number, number][];

+  /**
+   * Returns the overflowing encoding, after truncation
+   */
+  getOverflowing(): Encoding | undefined;
+
   /**
    * Returns the special tokens mask
    */
@@ -90,4 +95,43 @@ declare class Encoding {
    * Returns the type ids
    */
   getTypeIds(): number[];
+
+  /**
+   * Pad the current Encoding at the given length
+   *
+   * @param {number} length The length at which to pad
+   * @param {PaddingOptions} [options] Padding options
+   */
+  pad(length: number, options?: PaddingOptions): void;
+
+  /**
+   * Truncate the current Encoding at the given max_length
+   *
+   * @param {number} length The maximum length to be kept
+   * @param {number} [stride=0] The length of the previous first sequence
+   * to be included in the overflowing sequence
+   */
+  truncate(length: number, stride?: number): void;
 }
+
+interface PaddingOptions {
+  /**
+   * @default "right"
+   */
+  direction?: 'left' | 'right';
+  /**
+   * The index to be used when padding
+   * @default 0
+   */
+  padId?: number;
+  /**
+   * The type index to be used when padding
+   * @default 0
+   */
+  padTypeId?: number;
+  /**
+   * The pad token to be used when padding
+   * @default "[PAD]"
+   */
+  padToken?: string;
+}
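Usage note (not part of the diff): a minimal sketch of the new Encoding helpers driven through the callback-based encode. The Tokenizer is assumed to be constructed elsewhere, and 128/16 are arbitrary example lengths:

    import { Encoding, Tokenizer } from "../bindings/tokenizer";

    declare const tokenizer: Tokenizer; // assumed constructed elsewhere

    tokenizer.encode("Hello there!", null, (err: any, encoding: Encoding) => {
      if (err) throw err;

      // Keep at most 128 tokens; the last 16 tokens are carried as stride
      // into the overflowing sequence, then pad back to a fixed length
      encoding.truncate(128, 16);
      encoding.pad(128, { direction: "right", padToken: "[PAD]" });

      // Only defined after truncation, when something actually overflowed
      const overflowing = encoding.getOverflowing();
      console.log(encoding.getTypeIds(), overflowing?.getOffsets());
    });
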
2 bindings/node/lib/implementations/.gitignore (vendored, new file)

@@ -0,0 +1,2 @@
+*.d.ts
+*.js
28 bindings/node/lib/implementations/base.tokenizer.ts (new file)

@@ -0,0 +1,28 @@
+import { promisify } from "util";
+import { Encoding, Tokenizer } from "../bindings/tokenizer";
+
+export class BaseTokenizer {
+  constructor(private tokenizer: Tokenizer) {}
+
+  /**
+   * Encode the given sequence
+   *
+   * @param {string} sequence The sequence to encode
+   * @param {(string | null)} pair The optional pair sequence
+   */
+  async encode(sequence: string, pair?: string): Promise<Encoding> {
+    const encode = promisify(this.tokenizer.encode);
+    return encode(sequence, pair ?? null);
+  }
+
+  /**
+   * Encode the given sequences or pair of sequences
+   *
+   * @param {((string | [string, string])[])} sequences A list of sequences or pair of sequences.
+   * The list can contain both at the same time.
+   */
+  async encodeBatch(sequences: (string | [string, string])[]): Promise<Encoding[]> {
+    const encodeBatch = promisify(this.tokenizer.encodeBatch);
+    return encodeBatch(sequences);
+  }
+}
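Usage note (not part of the diff): a sketch of the wrapper above around an empty model. One caveat: promisify(this.tokenizer.encode) detaches the method from its receiver, so this relies on the native binding tolerating an unbound this; if it does not, the method would need this.tokenizer.encode.bind(this.tokenizer):

    import { Tokenizer } from "../bindings/tokenizer";
    import { models } from "../bindings/models";
    import { BaseTokenizer } from "./base.tokenizer";

    async function demo(): Promise<void> {
      // An empty WordPiece model is enough to exercise the async wrapper
      const base = new BaseTokenizer(new Tokenizer(models.WordPiece.empty()));

      const single = await base.encode("Hello world");
      const batch = await base.encodeBatch(["Hello world", ["question", "context"]]);
      console.log(single.getTypeIds(), batch.length);
    }
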
84 bindings/node/lib/implementations/bert-wordpiece.tokenizer.ts (new file)

@@ -0,0 +1,84 @@
+import { promisify } from "util";
+import { BaseTokenizer } from "./base.tokenizer";
+import { Tokenizer } from "../bindings/tokenizer";
+import { Model, models } from "../bindings/models";
+
+interface BertWordpieceOptions {
+  /**
+   * @default true
+   */
+  addSpecialTokens?: boolean;
+  /**
+   * @default true
+   */
+  cleanText?: boolean;
+  /**
+   * @default "[CLS]"
+   */
+  clsToken?: string;
+  /**
+   * @default true
+   */
+  handleChineseChars?: boolean;
+  /**
+   * @default true
+   */
+  lowercase?: boolean;
+  /**
+   * @default "[SEP]"
+   */
+  sepToken?: string;
+  /**
+   * @default true
+   */
+  stripAccents?: boolean;
+  /**
+   * @default "[UNK]"
+   */
+  unkToken?: string;
+  vocabFile?: string;
+  /**
+   * @default "##"
+   */
+  wordpiecesPrefix?: string;
+}
+
+const defaultBertOptions: Required<Omit<BertWordpieceOptions, 'vocabFile'>> & { vocabFile?: string } = {
+  addSpecialTokens: true,
+  cleanText: true,
+  clsToken: '[CLS]',
+  handleChineseChars: true,
+  lowercase: true,
+  sepToken: '[SEP]',
+  stripAccents: true,
+  unkToken: '[UNK]',
+  wordpiecesPrefix: '##'
+};
+
+/**
+ * Instantiate and return a new Bert WordPiece tokenizer
+ * @param options
+ */
+export async function getBertWordpieceTokenizer(options?: BertWordpieceOptions): Promise<BertWordpieceTokenizer> {
+  const mergedOptions = { ...defaultBertOptions, ...options };
+
+  let model: Model;
+  if (mergedOptions.vocabFile) {
+    const fromFiles = promisify(models.WordPiece.fromFiles);
+    model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
+  } else {
+    model = models.WordPiece.empty();
+  }
+
+  const tokenizer = new Tokenizer(model);
+  return new BertWordpieceTokenizer(tokenizer);
+}
+
+/**
+ * Bert WordPiece Tokenizer
+ */
+class BertWordpieceTokenizer extends BaseTokenizer {
+  constructor(tokenizer: Tokenizer) {
+    super(tokenizer);
+  }
+}
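Usage note (not part of the diff): the intended call-site shape for the factory above. The vocab path is a placeholder; without vocabFile the factory falls back to an empty model:

    import { getBertWordpieceTokenizer } from "./bert-wordpiece.tokenizer";

    async function demo(): Promise<void> {
      const tokenizer = await getBertWordpieceTokenizer({
        vocabFile: "./bert-base-uncased-vocab.txt" // placeholder path
      });
      // encode/encodeBatch are inherited from BaseTokenizer
      const encoding = await tokenizer.encode("Hello, y'all!", "How are you?");
      console.log(encoding.getTypeIds());
    }
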
52 bindings/node/lib/implementations/bpe.tokenizer.ts (new file)

@@ -0,0 +1,52 @@
+import { promisify } from "util";
+import { BaseTokenizer } from "./base.tokenizer";
+import { Model, models } from "../bindings/models";
+import { Tokenizer } from "../bindings/tokenizer";
+
+interface BPEOptions {
+  dropout?: number;
+  mergesFile?: string;
+  suffix?: string;
+  unkToken?: string;
+  vocabFile?: string;
+}
+
+const defaultBPEOptions: BPEOptions & Required<Pick<BPEOptions, 'unkToken' | 'suffix'>> = {
+  suffix: '</w>',
+  unkToken: '<unk>'
+};
+
+/**
+ * Instantiate and return a new BPE tokenizer
+ * @param options
+ */
+export async function getBPETokenizer(options?: BPEOptions): Promise<BPETokenizer> {
+  const mergedOptions = { ...defaultBPEOptions, ...options };
+
+  let model: Model;
+  if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
+    const fromFiles = promisify(models.BPE.fromFiles);
+    const modelOptions: models.BPE.BPEOptions = {
+      dropout: mergedOptions.dropout,
+      endOfWordSuffix: mergedOptions.suffix,
+      unkToken: mergedOptions.unkToken
+    };
+
+    model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
+  } else {
+    model = models.BPE.empty();
+  }
+
+  const tokenizer = new Tokenizer(model);
+  return new BPETokenizer(tokenizer);
+}
+
+/**
+ * Original BPE Tokenizer.
+ * Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
+ */
+class BPETokenizer extends BaseTokenizer {
+  constructor(tokenizer: Tokenizer) {
+    super(tokenizer);
+  }
+}
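Usage note (not part of the diff): the same pattern for the BPE factory. Paths are placeholders and dropout is an arbitrary example value:

    import { getBPETokenizer } from "./bpe.tokenizer";

    async function demo(): Promise<void> {
      // Both files must be set, otherwise the factory falls back to an empty model
      const tokenizer = await getBPETokenizer({
        vocabFile: "./vocab.json",  // placeholder paths
        mergesFile: "./merges.txt",
        dropout: 0.1
      });
      const encodings = await tokenizer.encodeBatch(["hello world", "lorem ipsum"]);
      console.log(encodings.map((e) => e.getOffsets()));
    }
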
2 bindings/node/lib/implementations/index.ts (new file)

@@ -0,0 +1,2 @@
+export * from './bert-wordpiece.tokenizer';
+export * from './bpe.tokenizer';
1 bindings/node/lib/index.d.ts (vendored)

@@ -1 +1,2 @@
 export * from './bindings';
+export * from './implementations';
bindings/node/lib/index.js

@@ -4,3 +4,4 @@ function __export(m) {
 }
 Object.defineProperty(exports, "__esModule", { value: true });
 __export(require("./bindings"));
+__export(require("./implementations"));
bindings/node/lib/index.ts

@@ -1 +1,2 @@
 export * from './bindings';
+export * from './implementations';
1003 bindings/node/package-lock.json (generated, new file)

File diff suppressed because it is too large
bindings/node/package.json

@@ -11,6 +11,7 @@
   "author": "Anthony MOI <m.anthony.moi@gmail.com>",
   "license": "Apache-2.0",
   "dependencies": {
+    "@types/node": "^13.1.6",
     "neon-cli": "^0.3.3",
     "node-pre-gyp": "github:amilajack/node-pre-gyp#neon-compat"
   },