big big big

This commit is contained in:
Pierric Cistac
2020-01-10 14:49:13 -05:00
parent 34875d5771
commit 80f6d58177
24 changed files with 762 additions and 345 deletions

View File

@ -4,32 +4,30 @@
  */
 declare class Decoder {}
 
-export namespace decoders {
 /**
  * Instantiate a new ByteLevel Decoder
  */
-  export function ByteLevel(): Decoder;
+export function byteLevelDecoder(): Decoder;
 
 /**
  * Instantiate a new WordPiece Decoder
  * @param {string} [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
  */
-  export function WordPiece(prefix?: string): Decoder;
+export function wordPieceDecoder(prefix?: string): Decoder;
 
 /**
  * Instantiate a new Metaspace Decoder
  *
  * @param {string} [replacement='▁'] The replacement character.
  * Must be exactly one character. By default we use the `▁` (U+2581) meta symbol (same as in SentencePiece).
  * @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
  * This lets us treat `hello` exactly like `say hello`.
  */
-  export function Metaspace(replacement?: string, addPrefixSpace?: boolean): Decoder;
+export function metaspaceDecoder(replacement?: string, addPrefixSpace?: boolean): Decoder;
 
 /**
- * Instantiate a new BPEDecoder
+ * Instantiate a new BPE Decoder
  * @param {string} [suffix='</w>'] The suffix that was used to characterize an end-of-word.
  * This suffix will be replaced by whitespaces during the decoding
  */
-export function BPEDecoder(suffix?: string): Decoder;
-}
+export function bpeDecoder(suffix?: string): Decoder;

View File

@ -0,0 +1,8 @@
var addon = require('../../native');
module.exports = {
byteLevelDecoder: addon.decoders_ByteLevel,
wordPieceDecoder: addon.decoders_WordPiece,
metaspaceDecoder: addon.decoders_Metaspace,
bpeDecoder: addon.decoders_BPEDecoder
};
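For quick reference, a minimal sketch of how the renamed decoder bindings are meant to be called (import path assumed relative to the lib root; the arguments shown are just the documented defaults):

import { byteLevelDecoder, wordPieceDecoder, metaspaceDecoder, bpeDecoder } from "./bindings/decoders";

const byteLevel = byteLevelDecoder();
const wordPiece = wordPieceDecoder("##");       // same as wordPieceDecoder()
const metaspace = metaspaceDecoder("▁", true);  // same as metaspaceDecoder()
const bpe = bpeDecoder("</w>");                 // same as bpeDecoder()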

View File

@ -1,7 +0,0 @@
export { decoders } from './decoders';
export { models } from './models';
export { normalizers } from './normalizers';
export { preTokenizers } from './pre-tokenizers';
export { postProcessors } from './post-processors';
export { Tokenizer } from './tokenizer';
export { trainers } from './trainers'

View File

@ -1,19 +0,0 @@
var addon = require('../../native');
exports.Tokenizer = addon.tokenizer_Tokenizer;
exports.models = {
BPE: {
fromFiles: addon.models_BPE_from_files,
empty: addon.models_BPE_empty,
},
WordPiece: addon.models_WordPiece,
}
exports.decoders = {
ByteLevel: addon.decoders_ByteLevel,
WordPiece: addon.decoders_WordPiece,
Metaspace: addon.decoders_Metaspace,
BPEDecoder: addon.decoders_BPEDecoder,
}
exports.post_processors = {
BertProcessing: addon.processors_BertProcessing,
}

View File

@ -6,73 +6,103 @@ declare class Model {
 }
 
-export namespace models {
-  export namespace BPE {
-    export interface BPEOptions {
+export namespace bpe {
+  export interface BPEModelOptions {
     /**
      * The number of words that the BPE cache can contain. The cache allows
      * to speed-up the process by keeping the result of the merge operations
      * for a number of words.
      */
     cacheCapacity?: number;
     /**
      * The BPE dropout to use. Must be a float between 0 and 1
      */
     dropout?: number;
     /**
      * The unknown token to be used by the model
      */
     unkToken?: string;
     /**
      * The prefix to attach to subword units that don't represent a beginning of word
      */
     continuingSubwordPrefix?: string;
     /**
      * The suffix to attach to subword units that represent an end of word
      */
     endOfWordSuffix?: string;
   }
 
   /**
    * Instantiate a BPE model from the given vocab and merges files
    *
-   * @param {string} vocab Path to a vocabulary JSON file
-   * @param {string} merges Path to a merge file
-   * @param {BPEOptions} [options] BPE model options
-   * @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
+   * @param vocab Path to a vocabulary JSON file
+   * @param merges Path to a merge file
+   * @param [options] BPE model options
    */
   export function fromFiles(
     vocab: string,
     merges: string,
-    options: BPEOptions | null,
-    __callback: (err: any, model: Model) => void
-  ): void;
+    options?: BPEModelOptions
+  ): Model;
+
+  /**
+   * Instantiate a BPE model from the given vocab and merges files
+   *
+   * @param vocab Path to a vocabulary JSON file
+   * @param merges Path to a merge file
+   * @param options BPE model options
+   * @param __callback Callback called when model is loaded
+   */
+  // export function fromFiles(
+  //   vocab: string,
+  //   merges: string,
+  //   options: BPEModelOptions | null,
+  //   __callback: (err: any, model: Model) => void
+  // ): void;
 
   /**
    * Instantiate an empty BPE Model
    */
   export function empty(): Model;
 }
 
-  export namespace WordPiece {
+export namespace wordPiece {
+  export interface WordPieceModelOptions {
+    /**
+     * The maximum number of characters to authorize in a single word.
+     * @default 100
+     */
+    maxInputCharsPerWord?: number;
+    /**
+     * The unknown token to be used by the model.
+     * @default "[UNK]"
+     */
+    unkToken?: string;
+  }
+
   /**
    * Instantiate a WordPiece model from the given vocab file
    *
    * @param {string} vocab Path to a vocabulary file
-   * @param {string} [unkToken] The unknown token to be used by the model
-   * @param {number} [maxInputCharsPerWord] The maximum number of characters to authorize in a single word
-   * @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
+   * @param [options] WordPiece model options
    */
-  export function fromFiles(
-    vocab: string,
-    unkToken: string,
-    maxInputCharsPerWord: number | null,
-    __callback: (err: any, model: Model) => void
-  ): void;
+  export function fromFiles(vocab: string, options?: WordPieceModelOptions): Model;
+
+  /**
+   * Instantiate a WordPiece model from the given vocab file
+   *
+   * @param vocab Path to a vocabulary file
+   * @param options WordPiece model options
+   * @param __callback Callback called when model is loaded
+   */
+  // export function fromFiles(
+  //   vocab: string,
+  //   options: WordPieceModelOptions | null,
+  //   __callback: (err: any, model: Model) => void
+  // ): void;
 
   /**
    * Instantiate an empty WordPiece model
    */
   export function empty(): Model;
 }
-}

View File

@ -0,0 +1,12 @@
var addon = require('../../native');
module.exports = {
bpe: {
fromFiles: addon.models_BPE_from_files,
empty: addon.models_BPE_empty,
},
wordPiece: {
fromFiles: addon.models_WordPiece_from_files,
empty: addon.models_WordPiece_empty,
}
}
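A sketch of the new model factories as declared in models.d.ts above: fromFiles is now synchronous and returns the Model directly (the callback-based overloads are commented out). The file paths below are placeholders; the import path is assumed relative to the lib root.

import { bpe, wordPiece } from "./bindings/models";

const bpeModel = bpe.fromFiles("./vocab.json", "./merges.txt", { unkToken: "<unk>", dropout: 0.1 });
const wordPieceModel = wordPiece.fromFiles("./vocab.txt", { unkToken: "[UNK]", maxInputCharsPerWord: 100 });
const emptyModel = bpe.empty();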

View File

@ -4,68 +4,66 @@
  */
 declare class Normalizer {}
 
-export namespace normalizers {
-  interface BertNormalizerOptions {
+interface BertNormalizerOptions {
   /**
    * Whether to clean the text, by removing any control characters
    * and replacing all whitespaces by the classic one.
    * @default true
    */
   cleanText?: boolean;
   /**
    * Whether to handle chinese chars by putting spaces around them.
    * @default true
    */
   handleChineseChars?: boolean;
   /**
    * Whether to lowercase.
    * @default true
    */
   lowercase?: boolean;
   /**
    * Whether to strip all accents.
    * @default true
    */
   stripAccents?: boolean;
 }
 
 /**
  * Instantiate a Bert Normalizer with the given options
  *
  * @param [options] Normalizer options
  * @returns {Normalizer} Bert Normalizer. Takes care of normalizing raw text before giving it to a Bert model.
  * This includes cleaning the text, handling accents, chinese chars and lowercasing
  */
 export function bertNormalizer(options?: BertNormalizerOptions): Normalizer;
 
 /**
  * Returns a new NFD Unicode Normalizer
  */
-  export function nfd(): Normalizer;
+export function nfdNormalizer(): Normalizer;
 
 /**
  * Returns a new NFKD Unicode Normalizer
  */
-  export function nfkd(): Normalizer;
+export function nfkdNormalizer(): Normalizer;
 
 /**
  * Returns a new NFC Unicode Normalizer
  */
-  export function nfc(): Normalizer;
+export function nfcNormalizer(): Normalizer;
 
 /**
  * Returns a new NFKC Unicode Normalizer
  */
-  export function nfkc(): Normalizer;
+export function nfkcNormalizer(): Normalizer;
 
 /**
  * Instantiate a new Normalization Sequence using the given normalizers
  * @param normalizers A list of Normalizer to be run as a sequence
  */
-  export function sequence(normalizers: Normalizer[]): Normalizer;
+export function sequenceNormalizer(normalizers: Normalizer[]): Normalizer;
 
 /**
  * Returns a new Lowercase Normalizer
  */
-  export function lowercase(): Normalizer;
-}
+export function lowercaseNormalizer(): Normalizer;

View File

@ -0,0 +1,11 @@
var addon = require('../../native');
module.exports = {
bertNormalizer: addon.normalizers_BertNormalizer,
nfdNormalizer: addon.normalizers_NFD,
nfkdNormalizer: addon.normalizers_NFKD,
nfcNormalizer: addon.normalizers_NFC,
nfkcNormalizer: addon.normalizers_NFKC,
sequenceNormalizer: addon.normalizers_Sequence,
lowercaseNormalizer: addon.normalizers_Lowercase
};
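The *Normalizer suffix on these exports keeps them distinct from the pre-tokenizer and decoder helpers of the same family; sequenceNormalizer chains several of them, which is exactly how BPETokenizer below builds its normalization pipeline. A minimal sketch (import path assumed relative to the lib root):

import { bertNormalizer, sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "./bindings/normalizers";

const forBert = bertNormalizer({ lowercase: true, stripAccents: true });
const chained = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);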

View File

@ -4,12 +4,10 @@
  */
 declare class PostProcessor {}
 
-export namespace postProcessors {
 /**
  * Instantiate a new BertProcessing with the given tokens
  *
  * @param {[string, number]} sep A tuple with the string representation of the SEP token, and its id
  * @param {[string, number]} cls A tuple with the string representation of the CLS token, and its id
  */
-  export function BertProcessing(sep: [string, number], cls: [string, number]): PostProcessor;
-}
+export function bertProcessing(sep: [string, number], cls: [string, number]): PostProcessor;

View File

@ -0,0 +1,5 @@
var addon = require('../../native');
module.exports = {
bertProcessing: addon.processors_BertProcessing
};
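bertProcessing takes (token, id) tuples; in BertWordPieceTokenizer.fromOptions below the ids are looked up with tokenizer.tokenToId, so the numbers here are only placeholders:

import { bertProcessing } from "./bindings/post-processors";

const processor = bertProcessing(["[SEP]", 102], ["[CLS]", 101]); // placeholder ids
// tokenizer.setPostProcessor(processor);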

View File

@ -4,50 +4,46 @@
  */
 declare class PreTokenizer {}
 
-export namespace preTokenizers {
-  export namespace byteLevel {
 /**
  * Instantiate a new ByteLevel PreTokenizer
  *
  * @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
  * This lets us treat `hello` exactly like `say hello`.
  * @returns {PreTokenizer} ByteLevel PreTokenizer.
  * This pre-tokenizer takes care of replacing all bytes of the given string
  * with a corresponding representation, as well as splitting into words.
  */
-    export function byteLevel(addPrefixSpace?: boolean): PreTokenizer;
+export function byteLevelPreTokenizer(addPrefixSpace?: boolean): PreTokenizer;
 
 /**
  * Returns the alphabet used by the ByteLevel PreTokenizer.
  * Since the ByteLevel works as its name suggests, at the byte level, it
  * encodes any byte to one visible character. This means that there is a
  * total of 256 different characters composing this alphabet.
  */
 export function byteLevelAlphabet(): string[];
-  }
 
 /**
  * Returns a Whitespace PreTokenizer
  * This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
  */
-  export function whitespace(): PreTokenizer;
+export function whitespacePreTokenizer(): PreTokenizer;
 
 /**
  * Returns a new Bert PreTokenizer.
  * This pre-tokenizer splits tokens on spaces, and also on punctuation.
  * Each occurrence of a punctuation character will be treated separately.
  */
 export function bertPreTokenizer(): PreTokenizer;
 
 /**
  * Returns a new Metaspace Tokenizer.
  * This pre-tokenizer replaces any whitespace by the provided replacement character.
  * It then tries to split on these spaces.
  *
  * @param {string} [replacement="▁"] The replacement character. Must be exactly one character.
  * By default we use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
  * @param {boolean} [addPrefixSpace] Whether to add a space to the first word if there isn't already one.
  * This lets us treat `hello` exactly like `say hello`.
  */
-  export function metaspace(replacement?: string, addPrefixSpace?: boolean): PreTokenizer;
-}
+export function metaspacePreTokenizer(replacement?: string, addPrefixSpace?: boolean): PreTokenizer;

View File

@ -0,0 +1,9 @@
var addon = require('../../native');
module.exports = {
byteLevelPreTokenizer: addon.pre_tokenizers_ByteLevel,
byteLevelAlphabet: addon.pre_tokenizers_ByteLevel_Alphabet,
whitespacePreTokenizer: addon.pre_tokenizers_Whitespace,
bertPreTokenizer: addon.pre_tokenizers_BertPreTokenizer,
metaspacePreTokenizer: addon.pre_tokenizers_Metaspace
};
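byteLevelAlphabet() is the piece that pairs with the ByteLevel pre-tokenizer: it returns the 256-character alphabet, which can be passed to a trainer as initialAlphabet so that every byte-level symbol ends up in the vocabulary (the ByteLevelBPETokenizer added later in this commit does exactly that). A sketch, with an assumed relative import path:

import { byteLevelPreTokenizer, byteLevelAlphabet } from "./bindings/pre-tokenizers";

const preTokenizer = byteLevelPreTokenizer(true); // addPrefixSpace
const alphabet = byteLevelAlphabet();             // 256 visible characters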

View File

@ -0,0 +1,3 @@
var addon = require('../../native');
module.exports.Tokenizer = addon.tokenizer_Tokenizer;
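How the bindings connect, mirroring the wiring used by the implementations below (the import paths and the particular components chosen here are illustrative): the Model goes into the Tokenizer constructor, everything else is attached through setters.

import { Tokenizer } from "./bindings/tokenizer";
import { bpe } from "./bindings/models";
import { nfkcNormalizer } from "./bindings/normalizers";
import { whitespacePreTokenizer } from "./bindings/pre-tokenizers";
import { bpeDecoder } from "./bindings/decoders";

const tokenizer = new Tokenizer(bpe.empty());
tokenizer.setNormalizer(nfkcNormalizer());
tokenizer.setPreTokenizer(whitespacePreTokenizer());
tokenizer.setDecoder(bpeDecoder("</w>"));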

View File

@ -4,59 +4,57 @@
  */
 declare class Trainer {}
 
-export namespace trainers {
-  interface TrainerOptions {
+interface TrainerOptions {
   /**
    * A prefix to be used for every subword that is not a beginning-of-word.
    */
   continuingSubwordPrefix?: string;
   /**
    * A suffix to be used for every subword that is an end-of-word.
    */
   endOfWordSuffix?: string;
   /**
    * A list of characters to include in the initial alphabet, even
    * if not seen in the training dataset.
    * If the strings contain more than one character, only the first one
    * is kept.
    * @default []
    */
   initialAlphabet?: string[];
   /**
    * The maximum different characters to keep in the alphabet.
    */
   limitAlphabet?: number;
   /**
    * The minimum frequency a pair should have in order to be merged.
    * @default 2
    */
   minFrequency?: number;
   /**
    * Whether to show progress bars while training.
    * @default true
    */
   showProgress?: boolean;
   /**
    * A list of special tokens the model should know of.
    * @default []
    */
   specialTokens?: string[];
   /**
    * The size of the final vocabulary, including all tokens and alphabet.
    * @default 30000
    */
   vocabSize?: number;
 }
 
 /**
  * Instantiate a new BPE Trainer
  * @param {TrainerOptions} [options] BPE Trainer options
  */
 export function bpeTrainer(options?: TrainerOptions): Trainer;
 
 /**
  * Instantiate a new WordPiece Trainer
  * @param {TrainerOptions} [options] WordPiece Trainer options
  */
 export function wordPieceTrainer(options?: TrainerOptions): Trainer;
-}

View File

@ -0,0 +1,6 @@
var addon = require('../../native');
module.exports = {
bpeTrainer: addon.trainers_BPETrainer,
wordPieceTrainer: addon.trainers_WordPieceTrainer
};
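A trainer built from TrainerOptions is handed to Tokenizer.train together with the training files, which is what the train() methods of the implementations below do. Sketch with placeholder file names and an assumed relative import path:

import { wordPieceTrainer } from "./bindings/trainers";

const trainer = wordPieceTrainer({
  vocabSize: 30000,
  minFrequency: 2,
  specialTokens: ["[UNK]", "[SEP]", "[CLS]"]
});
// tokenizer.train(trainer, ["./corpus-part-1.txt", "./corpus-part-2.txt"]);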

View File

@ -2,7 +2,7 @@ import { promisify } from "util";
 import { Encoding, Tokenizer } from "../bindings/tokenizer";
 
 export class BaseTokenizer {
-  constructor(private tokenizer: Tokenizer) {}
+  constructor(protected tokenizer: Tokenizer) {}
 
   /**
    * Encode the given sequence
@ -11,7 +11,7 @@ export class BaseTokenizer {
   * @param {(string | null)} pair The optional pair sequence
   */
  async encode(sequence: string, pair?: string): Promise<Encoding> {
-    const encode = promisify(this.tokenizer.encode);
+    const encode = promisify(this.tokenizer.encode.bind(this.tokenizer));
     return encode(sequence, pair ?? null);
   }
@ -22,7 +22,7 @@ export class BaseTokenizer {
   * The list can contain both at the same time.
   */
  async encodeBatch(sequences: (string | [string, string])[]): Promise<Encoding[]> {
-    const encodeBatch = promisify(this.tokenizer.encodeBatch);
+    const encodeBatch = promisify(this.tokenizer.encodeBatch.bind(this.tokenizer));
     return encodeBatch(sequences);
   }
 }
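The .bind(this.tokenizer) added here matters because promisify copies the function: calling the promisified method detaches it from its receiver, so any use of `this` inside the native method would see undefined. A generic illustration of the pattern (not the tokenizer API):

import { promisify } from "util";

class Counter {
  private n = 0;
  increment(cb: (err: Error | null, value: number) => void): void {
    cb(null, ++this.n); // relies on `this` being the Counter instance
  }
}

const counter = new Counter();

const unbound = promisify(counter.increment);
unbound().catch(err => console.log(err instanceof TypeError)); // true: `this` was lost

const bound = promisify(counter.increment.bind(counter));
bound().then(value => console.log(value)); // 1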

View File

@ -1,9 +1,14 @@
 import { promisify } from "util";
 import { BaseTokenizer } from "./base.tokenizer";
 import { Tokenizer } from "../bindings/tokenizer";
-import { Model, models } from "../bindings/models";
+import { Model, wordPiece } from "../bindings/models";
+import { bertNormalizer } from "../bindings/normalizers";
+import { bertPreTokenizer } from "../bindings/pre-tokenizers";
+import { bertProcessing } from "../bindings/post-processors";
+import { wordPieceDecoder } from "../bindings/decoders";
+import { wordPieceTrainer } from "../bindings/trainers";
 
-interface BertWordpieceOptions {
+export interface BertWordPieceOptions {
   /**
    * @default true
    */
@ -43,42 +48,121 @@ interface BertWordpieceOptions {
   wordpiecesPrefix?: string;
 }
 
-const defaultBertOptions: Required<Omit<BertWordpieceOptions, 'vocabFile'>> & { vocabFile?: string } = {
-  addSpecialTokens: true,
-  cleanText: true,
-  clsToken: '[CLS]',
-  handleChineseChars: true,
-  lowercase: true,
-  sepToken: '[SEP]',
-  stripAccents: true,
-  unkToken: '[UNK]',
-  wordpiecesPrefix: '##'
-};
-
-/**
- * Instantiate and returns a new Bert WordPiece tokenizer
- * @param options
- */
-export async function getBertWordpieceTokenizer(options?: BertWordpieceOptions): Promise<BertWordpieceTokenizer> {
-  const mergedOptions = { ...defaultBertOptions, ...options };
-
-  let model: Model;
-  if (mergedOptions.vocabFile) {
-    const fromFiles = promisify(models.WordPiece.fromFiles);
-    model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
-  } else {
-    model = models.WordPiece.empty();
-  }
-
-  const tokenizer = new Tokenizer(model);
-  return new BertWordpieceTokenizer(tokenizer);
-}
+export interface BertWordPieceTrainOptions {
+  /**
+   * @default []
+   */
+  initialAlphabet?: string[];
+  /**
+   * @default 1000
+   */
+  limitAlphabet?: number;
+  /**
+   * @default 2
+   */
+  minFrequency?: number;
+  /**
+   * @default true
+   */
+  showProgress?: boolean;
+  /**
+   * @default ["[UNK]", "[SEP]", "[CLS]"]
+   */
+  specialTokens?: string[];
+  /**
+   * @default 30000
+   */
+  vocabSize?: number;
+  /**
+   * @default "##"
+   */
+  wordpiecesPrefix?: string;
+}
 
 /**
  * Bert WordPiece Tokenizer
  */
-class BertWordpieceTokenizer extends BaseTokenizer {
-  constructor(tokenizer: Tokenizer) {
+export class BertWordPieceTokenizer extends BaseTokenizer {
+  private static readonly defaultBertOptions:
+    Required<Omit<BertWordPieceOptions, "vocabFile">> & { vocabFile?: string } = {
+    addSpecialTokens: true,
+    cleanText: true,
+    clsToken: "[CLS]",
+    handleChineseChars: true,
+    lowercase: true,
+    sepToken: "[SEP]",
+    stripAccents: true,
+    unkToken: "[UNK]",
+    wordpiecesPrefix: "##"
+  };
+
+  private readonly defaultTrainOptions: Required<BertWordPieceTrainOptions> = {
+    initialAlphabet: [],
+    limitAlphabet: 1000,
+    minFrequency: 2,
+    showProgress: true,
+    specialTokens: ["<unk>"],
+    vocabSize: 30000,
+    wordpiecesPrefix: "##"
+  };
+
+  private constructor(tokenizer: Tokenizer) {
     super(tokenizer);
   }
+
+  /**
+   * Instantiate and returns a new Bert WordPiece tokenizer
+   * @param [options] Optional tokenizer options
+   */
+  static async fromOptions(options?: BertWordPieceOptions): Promise<BertWordPieceTokenizer> {
+    const mergedOptions = { ...this.defaultBertOptions, ...options };
+
+    let model: Model;
+    if (mergedOptions.vocabFile) {
+      // const fromFiles = promisify(WordPiece.fromFiles);
+      model = wordPiece.fromFiles(mergedOptions.vocabFile, { unkToken: mergedOptions.unkToken });
+      // model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
+    } else {
+      model = wordPiece.empty();
+    }
+
+    const tokenizer = new Tokenizer(model);
+
+    const normalizer = bertNormalizer(mergedOptions);
+    tokenizer.setNormalizer(normalizer);
+    tokenizer.setPreTokenizer(bertPreTokenizer());
+
+    const sepTokenId = tokenizer.tokenToId(mergedOptions.sepToken);
+    if (sepTokenId === undefined) {
+      throw new Error("sepToken not found in the vocabulary");
+    }
+    const clsTokenId = tokenizer.tokenToId(mergedOptions.clsToken);
+    if (clsTokenId === undefined) {
+      throw new Error("clsToken not found in the vocabulary");
+    }
+
+    if (mergedOptions.addSpecialTokens) {
+      const processor = bertProcessing([mergedOptions.sepToken, sepTokenId], [mergedOptions.clsToken, clsTokenId]);
+      tokenizer.setPostProcessor(processor);
+    }
+
+    const decoder = wordPieceDecoder(mergedOptions.wordpiecesPrefix);
+    tokenizer.setDecoder(decoder);
+
+    return new BertWordPieceTokenizer(tokenizer);
+  }
+
+  /**
+   * Train the model using the given files
+   *
+   * @param files Files to use for training
+   * @param [options] Training options
+   */
+  async train(files: string[], options?: BertWordPieceTrainOptions): Promise<void> {
+    const mergedOptions = { ...this.defaultTrainOptions, ...options };
+    const trainer = wordPieceTrainer(mergedOptions);
+
+    this.tokenizer.train(trainer, files);
+  }
 }
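Usage sketch for the new static factory that replaces getBertWordpieceTokenizer (the vocab path is a placeholder, and the vocabulary must contain the [SEP] and [CLS] tokens or fromOptions throws):

import { BertWordPieceTokenizer } from "./implementations";

async function example() {
  const tokenizer = await BertWordPieceTokenizer.fromOptions({ vocabFile: "./vocab.txt" });
  return tokenizer.encode("Hello there!", "How are you?");
}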

View File

@ -1,52 +1,126 @@
 import { promisify } from "util";
 import { BaseTokenizer } from "./base.tokenizer";
-import { Model, models } from "../bindings/models";
+import { Model, bpe } from "../bindings/models";
 import { Tokenizer } from "../bindings/tokenizer";
+import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
+import { whitespacePreTokenizer } from "../bindings/pre-tokenizers";
+import { bpeDecoder } from "../bindings/decoders";
+import { bpeTrainer } from "../bindings/trainers";
 
-interface BPEOptions {
+export interface BPETokenizerOptions {
   dropout?: number;
   mergesFile?: string;
+  /**
+   * @default "</w>"
+   */
   suffix?: string;
+  /**
+   * @default "<unk>"
+   */
   unkToken?: string;
   vocabFile?: string;
 }
 
-const defaultBPEOptions: BPEOptions & Required<Pick<BPEOptions, 'unkToken' | 'suffix'>> = {
-  suffix: '</w>',
-  unkToken: '<unk>'
-};
-
-/**
- * Instantiate and returns a new BPE tokenizer
- * @param options
- */
-export async function getBPETokenizer(options?: BPEOptions): Promise<BPETokenizer> {
-  const mergedOptions = { ...defaultBPEOptions, ...options };
-
-  let model: Model;
-  if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
-    const fromFiles = promisify(models.BPE.fromFiles);
-    const modelOptions: models.BPE.BPEOptions = {
-      dropout: mergedOptions.dropout,
-      endOfWordSuffix: mergedOptions.suffix,
-      unkToken: mergedOptions.unkToken
-    };
-
-    model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
-  } else {
-    model = models.BPE.empty();
-  }
-
-  const tokenizer = new Tokenizer(model);
-  return new BPETokenizer(tokenizer);
-}
+export interface BPETokenizerTrainOptions {
+  /**
+   * @default []
+   */
+  initialAlphabet?: string[];
+  /**
+   * @default 1000
+   */
+  limitAlphabet?: number;
+  /**
+   * @default 2
+   */
+  minFrequency?: number;
+  /**
+   * @default true
+   */
+  showProgress?: boolean;
+  /**
+   * @default ["<unk>"]
+   */
+  specialTokens?: string[];
+  /**
+   * @default "</w>"
+   */
+  suffix?: string;
+  /**
+   * @default 30000
+   */
+  vocabSize?: number;
+}
 
 /**
  * Original BPE Tokenizer.
  * Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
  */
-class BPETokenizer extends BaseTokenizer {
-  constructor(tokenizer: Tokenizer) {
+export class BPETokenizer extends BaseTokenizer {
+  private static readonly defaultBPEOptions:
+    BPETokenizerOptions & Required<Pick<BPETokenizerOptions, "unkToken" | "suffix">> = {
+    suffix: "</w>",
+    unkToken: "<unk>"
+  };
+
+  private readonly defaultTrainOptions: Required<BPETokenizerTrainOptions> = {
+    initialAlphabet: [],
+    limitAlphabet: 1000,
+    minFrequency: 2,
+    showProgress: true,
+    specialTokens: ["<unk>"],
+    suffix: "</w>",
+    vocabSize: 30000
+  };
+
+  private constructor(tokenizer: Tokenizer) {
     super(tokenizer);
   }
+
+  /**
+   * Instantiate and returns a new BPE tokenizer
+   * @param [options] Optional tokenizer options
+   */
+  static async fromOptions(options?: BPETokenizerOptions): Promise<BPETokenizer> {
+    const mergedOptions = { ...this.defaultBPEOptions, ...options };
+
+    let model: Model;
+    if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
+      // const fromFiles = promisify(BPE.fromFiles);
+      const modelOptions: bpe.BPEModelOptions = {
+        dropout: mergedOptions.dropout,
+        endOfWordSuffix: mergedOptions.suffix,
+        unkToken: mergedOptions.unkToken
+      };
+
+      model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
+      // model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
+    } else {
+      model = bpe.empty();
+    }
+
+    const tokenizer = new Tokenizer(model);
+
+    const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);
+    tokenizer.setNormalizer(normalizer);
+    tokenizer.setPreTokenizer(whitespacePreTokenizer());
+
+    const decoder = bpeDecoder(mergedOptions.suffix);
+    tokenizer.setDecoder(decoder);
+
+    return new BPETokenizer(tokenizer);
+  }
+
+  /**
+   * Train the model using the given files
+   *
+   * @param files Files to use for training
+   * @param [options] Training options
+   */
+  async train(files: string[], options?: BPETokenizerTrainOptions): Promise<void> {
+    const mergedOptions = { ...this.defaultTrainOptions, ...options };
+    const trainer = bpeTrainer(mergedOptions);
+
+    this.tokenizer.train(trainer, files);
+  }
 }
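Training flow sketch for the reworked BPETokenizer: without vocabFile/mergesFile it starts from bpe.empty(), learns merges from the given files, then encodes. The file names are placeholders.

import { BPETokenizer } from "./implementations";

async function trainExample() {
  const tokenizer = await BPETokenizer.fromOptions();
  await tokenizer.train(["./corpus-a.txt", "./corpus-b.txt"], { vocabSize: 5000 });
  return tokenizer.encode("hello tokenizers");
}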

View File

@ -0,0 +1,91 @@
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, bpe } from "../bindings/models";
import { nfkcNormalizer } from "../bindings/normalizers";
import { byteLevelPreTokenizer, byteLevelAlphabet } from "../bindings/pre-tokenizers";
import { byteLevelDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";
export interface ByteLevelBPETokenizerOptions {
/**
* @default false
*/
addPrefixSpace?: boolean;
mergesFile?: string;
vocabFile?: string;
}
export interface ByteLevelBPETrainOptions {
/**
* @default 2
*/
minFrequency?: number;
/**
* @default true
*/
showProgress?: boolean;
/**
* @default []
*/
specialTokens?: string[];
/**
* @default 30000
*/
vocabSize?: number;
}
/**
* Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
*/
export class ByteLevelBPETokenizer extends BaseTokenizer {
private static readonly defaultOptions:
ByteLevelBPETokenizerOptions & Required<Pick<ByteLevelBPETokenizerOptions, 'addPrefixSpace'>> = {
addPrefixSpace: false
};
private readonly defaultTrainOptions: Required<ByteLevelBPETrainOptions> = {
minFrequency: 2,
showProgress: true,
specialTokens: ['<unk>'],
vocabSize: 30000
};
private constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
static async fromOptions(options?: ByteLevelBPETokenizerOptions): Promise<ByteLevelBPETokenizer> {
const mergedOptions = { ...this.defaultOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
// const fromFiles = promisify(BPE.fromFiles);
model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile);
// model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, null);
} else {
model = bpe.empty();
}
const tokenizer = new Tokenizer(model);
tokenizer.setNormalizer(nfkcNormalizer());
const preTokenizer = byteLevelPreTokenizer(mergedOptions.addPrefixSpace);
tokenizer.setPreTokenizer(preTokenizer);
tokenizer.setDecoder(byteLevelDecoder());
return new ByteLevelBPETokenizer(tokenizer);
}
/**
* Train the model using the given files
*
* @param files Files to use for training
* @param [options] Training options
*/
async train(files: string[], options?: ByteLevelBPETrainOptions): Promise<void> {
const mergedOptions = { ...this.defaultTrainOptions, ...options };
const trainer = bpeTrainer({ ...mergedOptions, initialAlphabet: byteLevelAlphabet() });
this.tokenizer.train(trainer, files);
}
}

View File

@ -1,2 +1,4 @@
 export * from './bert-wordpiece.tokenizer';
 export * from './bpe.tokenizer';
+export * from './byte-level-bpe.tokenizer';
+export * from './sentence-piece.tokenizer';

View File

@ -0,0 +1,121 @@
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, bpe } from "../bindings/models";
import { nfkcNormalizer } from "../bindings/normalizers";
import { metaspacePreTokenizer } from "../bindings/pre-tokenizers";
import { metaspaceDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";
export interface SentencePieceTokenizerOptions extends OptionsWithDefaults {
dropout?: number;
mergesFile?: string;
vocabFile?: string;
}
interface OptionsWithDefaults {
/**
* @default true
*/
addPrefixSpace?: boolean;
/**
* @default "▁"
*/
replacement?: string;
/**
* @default "<unk>"
*/
unkToken?: string;
}
export interface SentencePieceTrainOptions {
/**
* @default []
*/
initialAlphabet?: string[];
/**
* @default 1000
*/
limitAlphabet?: number;
/**
* @default 2
*/
minFrequency?: number;
/**
* @default true
*/
showProgress?: boolean;
/**
* @default ["<unk>"]
*/
specialTokens?: string[];
/**
* @default 30000
*/
vocabSize?: number;
}
/**
* Represents the BPE algorithm, with the pretokenization used by SentencePiece
*/
export class SentencePieceTokenizer extends BaseTokenizer {
private static readonly defaultOptions: SentencePieceTokenizerOptions & Required<OptionsWithDefaults> = {
addPrefixSpace: true,
replacement: '▁',
unkToken: '<unk>'
};
private readonly defaultTrainOptions: Required<SentencePieceTrainOptions> = {
initialAlphabet: [],
limitAlphabet: 1000,
minFrequency: 2,
showProgress: true,
specialTokens: ['<unk>'],
vocabSize: 30000
};
private constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
static async fromOptions(options?: SentencePieceTokenizerOptions): Promise<SentencePieceTokenizer> {
const mergedOptions = { ...this.defaultOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
// const fromFiles = promisify(BPE.fromFiles);
const modelOptions: bpe.BPEModelOptions = {
dropout: mergedOptions.dropout,
unkToken: mergedOptions.unkToken
};
model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
// model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, null);
} else {
model = bpe.empty();
}
const tokenizer = new Tokenizer(model);
tokenizer.setNormalizer(nfkcNormalizer());
const preTokenizer = metaspacePreTokenizer(mergedOptions.replacement, mergedOptions.addPrefixSpace);
tokenizer.setPreTokenizer(preTokenizer);
const decoder = metaspaceDecoder(mergedOptions.replacement, mergedOptions.addPrefixSpace);
tokenizer.setDecoder(decoder);
return new SentencePieceTokenizer(tokenizer);
}
/**
* Train the model using the given files
*
* @param files Files to use for training
* @param [options] Training options
*/
async train(files: string[], options?: SentencePieceTrainOptions): Promise<void> {
const mergedOptions = { ...this.defaultTrainOptions, ...options };
const trainer = bpeTrainer(mergedOptions);
this.tokenizer.train(trainer, files);
}
}

View File

@ -1,2 +1 @@
-export * from './bindings';
 export * from './implementations';

View File

@ -3,5 +3,5 @@ function __export(m) {
 for (var p in m) if (!exports.hasOwnProperty(p)) exports[p] = m[p];
 }
 Object.defineProperty(exports, "__esModule", { value: true });
-__export(require("./bindings"));
+// export * from './bindings';
 __export(require("./implementations"));

View File

@ -1,2 +1,2 @@
-export * from './bindings';
+// export * from './bindings';
 export * from './implementations';