add more methods on tokenizer

2025-08-22 16:25:30 +00:00 · 2020-01-09 18:06:01 -05:00
parent 66d65595f6
commit 969d994f70
1 changed files with 88 additions and 8 deletions
--- a/bindings/node/lib/bindings/tokenizer.d.ts
+++ b/bindings/node/lib/bindings/tokenizer.d.ts
@ -1,4 +1,5 @@
 import { Model } from "./models";
+import { Decoder } from "./decoders";

 /**
 * A Tokenizer works as a pipeline, it processes some raw text as input and outputs
@ -18,25 +19,86 @@ export class Tokenizer {
   * Instantiate a new Tokenizer using the given Model
   */
  constructor(model: Model);
+
+  /**
+   * Add the given tokens to the vocabulary
+   *
+   * @param tokens A list of tokens to add to the vocabulary.
+   * Each token can either be a string, or a tuple with a string representing the token,
+   * and a boolean option representing whether to match on single words only.
+   * If the boolean is not included, it defaults to False
+   * @returns The number of tokens that were added to the vocabulary
+   */
+  addTokens(tokens: (string | [string, boolean])[]): number;
+
+  /**
+   * Add the given special tokens to the vocabulary, and treat them as special tokens.
+   * The special tokens will never be processed by the model, and will be removed while decoding.
+   *
+   * @param tokens The list of special tokens to add
+   * @returns {number} The number of tokens that were added to the vocabulary
+   */
+  addSpecialTokens(tokens: string[]): number;
  
  /**
   * Encode the given sequence
   *
-   * @param {string} sequence The sequence to encode
-   * @param {(string | null)} pair The optional pair sequence
-   * @param {(err: any, encoding: Encoding) => void} __callback Callback called when encoding is complete
+   * @param sequence The sequence to encode
+   * @param pair The optional pair sequence
+   * @param __callback Callback called when encoding is complete
   */
  encode(sequence: string, pair: string | null, __callback: (err: any, encoding: Encoding) => void): void;

  /**
   * Encode the given sequences or pair of sequences
   *
-   * @param {((string | [string, string])[])} sequences A list of sequences or pair of sequences.
-   * The list can contain both at the same time.
-   * @param {(err: any, encodings: Encoding[]) => void} __callback Callback called when encoding is complete
+   * @param sequences A list of sequences or pair of sequences. The list can contain both at the same time.
+   * @param __callback Callback called when encoding is complete
   */
  encodeBatch(sequences: (string | [string, string])[], __callback: (err: any, encodings: Encoding[]) => void): void;

+  /**
+   * Decode the given list of ids to a string sequence
+   *
+   * @param ids A list of ids to be decoded
+   * @param {boolean} [skipSpecialTokens=true] Whether to remove all the special tokens from the output string
+   * @returns The decoded string
+   */
+  decode(ids: number[], skipSpecialTokens?: boolean): string;
+
+  /**
+   * Decode the list of sequences to a list of string sequences
+   *
+   * @param sequences A list of sequence of ids to be decoded
+   * @param {boolean} [skipSpecialTokens] Whether to remove all the special tokens from the output strings
+   * @returns A list of decoded strings
+   */
+  decodeBatch(sequences: number[][], skipSpecialTokens?: boolean): string[];
+
+  /**
+   * Convert the given token id to its corresponding string
+   *
+   * @param id The token id to convert
+   * @returns The corresponding string if it exists
+   */
+  idToToken(id: number): string | undefined;
+
+  /**
+   * Convert the given token to its corresponding id
+   *
+   * @param token The token to convert
+   * @returns The corresponding id if it exists
+   */
+  tokenToId(token: string): number | undefined;
+
+  /**
+   * Train the model using the given files
+   *
+   * @param trainer Trainer to use
+   * @param files List of files to use
+   */
+  train(trainer: Trainer, files: string[]): void;
+
  /**
   * Returns the size of the vocabulary
   *
@ -49,12 +111,30 @@ export class Tokenizer {
   */
  runningTasks(): number;

+  getModel(): Model;
+
  /**
   * Change the model to use with this Tokenizer
   * @param model New model to use
   * @throws Will throw an error if any task is running
   */
  setModel(model: Model): void;
+
+  getNormalizer(): Normalizer;
+
+  setNormalizer(normalizer: Normalizer): void;
+
+  getPreTokenizer(): PreTokenizer;
+
+  setPreTokenizer(preTokenizer: PreTokenizer): void;
+
+  getPostProcessor(): PostProcessor;
+
+  setPostProcessor(processor: PostProcessor): void;
+
+  getDecoder(): Decoder;
+
+  setDecoder(decoder: Decoder): void;
 }

 /**
@ -99,7 +179,7 @@ declare class Encoding {
  /**
   * Pad the current Encoding at the given length
   *
-   * @param {number} length The length at which to pad
+   * @param length The length at which to pad
   * @param {PaddingOptions} [options] Padding options
   */
  pad(length: number, options?: PaddingOptions): void;
@ -107,7 +187,7 @@ declare class Encoding {
  /**
   * Truncate the current Encoding at the given max_length
   *
-   * @param {number} length The maximum length to be kept
+   * @param length The maximum length to be kept
   * @param {number} [stride=0] The length of the previous first sequence
   * to be includedin the overflowing sequence
   */