big big big

Pierric Cistac
2020-01-10 14:49:13 -05:00
parent 34875d5771
commit 80f6d58177
24 changed files with 762 additions and 345 deletions

View File

@@ -4,32 +4,30 @@
*/
declare class Decoder {}
export namespace decoders {
/**
* Instantiate a new ByteLevel Decoder
*/
export function ByteLevel(): Decoder;
/**
* Instantiate a new ByteLevel Decoder
*/
export function byteLevelDecoder(): Decoder;
/**
* Instantiate a new WordPiece Decoder
* @param {string} [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
*/
export function WordPiece(prefix?: string): Decoder;
/**
* Instantiate a new WordPiece Decoder
* @param {string} [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
*/
export function wordPieceDecoder(prefix?: string): Decoder;
/**
* Instantiate a new Metaspace
*
* @param {string} [replacement='▁'] The replacement character.
* Must be exactly one character. By default we use the `▁` (U+2581) meta symbol (same as in SentencePiece).
* @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
* This lets us treat `hello` exactly like `say hello`.
*/
export function Metaspace(replacement?: string, addPrefixSpace?: boolean): Decoder;
/**
* Instantiate a new Metaspace
*
* @param {string} [replacement='▁'] The replacement character.
* Must be exactly one character. By default we use the `▁` (U+2581) meta symbol (same as in SentencePiece).
* @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
* This lets us treat `hello` exactly like `say hello`.
*/
export function metaspaceDecoder(replacement?: string, addPrefixSpace?: boolean): Decoder;
/**
* Instantiate a new BPEDecoder
* @param {string} [suffix='</w>'] The suffix that was used to characterize an end-of-word.
* This suffix will be replaced by whitespace during decoding
*/
export function BPEDecoder(suffix?: string): Decoder;
}
/**
* Instantiate a new BPE Decoder
* @param {string} [suffix='</w>'] The suffix that was used to characterize an end-of-word.
* This suffix will be replaced by whitespace during decoding
*/
export function bpeDecoder(suffix?: string): Decoder;
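A minimal usage sketch of the renamed decoder factories declared above; the relative import path and argument values are illustrative only:

import { byteLevelDecoder, wordPieceDecoder, metaspaceDecoder, bpeDecoder } from "../bindings/decoders";

// Each factory returns an opaque Decoder that is attached to a Tokenizer
// via tokenizer.setDecoder(...).
const byteLevel = byteLevelDecoder();
const wordPiece = wordPieceDecoder("##");       // prefix marking non-initial subwords
const metaspace = metaspaceDecoder("▁", true);  // SentencePiece-style replacement character
const bpe = bpeDecoder("</w>");                 // end-of-word suffix to strip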

View File

@@ -0,0 +1,8 @@
var addon = require('../../native');
module.exports = {
byteLevelDecoder: addon.decoders_ByteLevel,
wordPieceDecoder: addon.decoders_WordPiece,
metaspaceDecoder: addon.decoders_Metaspace,
bpeDecoder: addon.decoders_BPEDecoder
};

View File

@@ -1,7 +0,0 @@
export { decoders } from './decoders';
export { models } from './models';
export { normalizers } from './normalizers';
export { preTokenizers } from './pre-tokenizers';
export { postProcessors } from './post-processors';
export { Tokenizer } from './tokenizer';
export { trainers } from './trainers'

View File

@@ -1,19 +0,0 @@
var addon = require('../../native');
exports.Tokenizer = addon.tokenizer_Tokenizer;
exports.models = {
BPE: {
fromFiles: addon.models_BPE_from_files,
empty: addon.models_BPE_empty,
},
WordPiece: addon.models_WordPiece,
}
exports.decoders = {
ByteLevel: addon.decoders_ByteLevel,
WordPiece: addon.decoders_WordPiece,
Metaspace: addon.decoders_Metaspace,
BPEDecoder: addon.decoders_BPEDecoder,
}
exports.post_processors = {
BertProcessing: addon.processors_BertProcessing,
}

View File

@@ -6,73 +6,103 @@ declare class Model {
}
export namespace models {
export namespace BPE {
export interface BPEOptions {
/**
* The number of words that the BPE cache can contain. The cache speeds
* up the process by keeping the result of the merge operations
* for a number of words.
*/
cacheCapacity?: number;
/**
* The BPE dropout to use. Must be a float between 0 and 1
*/
dropout?: number;
/**
* The unknown token to be used by the model
*/
unkToken?: string;
/**
* The prefix to attach to subword units that don't represent a beginning of word
*/
continuingSubwordPrefix?: string;
/**
* The suffix to attach to subword units that represent an end of word
*/
endOfWordSuffix?: string;
}
export namespace bpe {
export interface BPEModelOptions {
/**
* Instantiate a BPE model from the given vocab and merges files
*
* @param {string} vocab Path to a vocabulary JSON file
* @param {string} merges Path to a merge file
* @param {BPEOptions} [options] BPE model options
* @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
* The number of words that the BPE cache can contain. The cache speeds
* up the process by keeping the result of the merge operations
* for a number of words.
*/
export function fromFiles(
vocab: string,
merges: string,
options: BPEOptions | null,
__callback: (err: any, model: Model) => void
): void;
cacheCapacity?: number;
/**
* Instantiate an empty BPE Model
* The BPE dropout to use. Must be a float between 0 and 1
*/
export function empty(): Model;
dropout?: number;
/**
* The unknown token to be used by the model
*/
unkToken?: string;
/**
* The prefix to attach to subword units that don't represent a beginning of word
*/
continuingSubwordPrefix?: string;
/**
* The suffix to attach to subword units that represent an end of word
*/
endOfWordSuffix?: string;
}
export namespace WordPiece {
/**
* Instantiate a WordPiece model from the given vocab file
*
* @param {string} vocab Path to a vocabulary file
* @param {string} [unkToken] The unknown token to be used by the model
* @param {number} [maxInputCharsPerWord] The maximum number of characters to allow in a single word
* @param {(err: any, model: Model) => void} __callback Callback called when model is loaded
*/
export function fromFiles(
vocab: string,
unkToken: string,
maxInputCharsPerWord: number | null,
__callback: (err: any, model: Model) => void
): void;
/**
* Instantiate a BPE model from the given vocab and merges files
*
* @param vocab Path to a vocabulary JSON file
* @param merges Path to a merge file
* @param [options] BPE model options
*/
export function fromFiles(
vocab: string,
merges: string,
options?: BPEModelOptions
): Model;
/**
* Instantiate an empty WordPiece model
*/
export function empty(): Model;
}
/**
* Instantiate a BPE model from the given vocab and merges files
*
* @param vocab Path to a vocabulary JSON file
* @param merges Path to a merge file
* @param options BPE model options
* @param __callback Callback called when model is loaded
*/
// export function fromFiles(
// vocab: string,
// merges: string,
// options: BPEModelOptions | null,
// __callback: (err: any, model: Model) => void
// ): void;
/**
* Instantiate an empty BPE Model
*/
export function empty(): Model;
}
export namespace wordPiece {
export interface WordPieceModelOptions {
/**
* The maximum number of characters to allow in a single word.
* @default 100
*/
maxInputCharsPerWord?: number;
/**
* The unknown token to be used by the model.
* @default "[UNK]"
*/
unkToken?: string;
}
/**
* Instantiate a WordPiece model from the given vocab file
*
* @param {string} vocab Path to a vocabulary file
* @param [options] WordPiece model options
*/
export function fromFiles(vocab: string, options?: WordPieceModelOptions): Model;
/**
* Instantiate a WordPiece model from the given vocab file
*
* @param vocab Path to a vocabulary file
* @param options WordPiece model options
* @param __callback Callback called when model is loaded
*/
// export function fromFiles(
// vocab: string,
// options: WordPieceModelOptions | null,
// __callback: (err: any, model: Model) => void
// ): void;
/**
* Instantiate an empty WordPiece model
*/
export function empty(): Model;
}
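A minimal usage sketch of the new `bpe` and `wordPiece` namespaces declared above; file paths and option values are placeholders:

import { Tokenizer } from "../bindings/tokenizer";
import { bpe, wordPiece } from "../bindings/models";

// Load a BPE model from placeholder vocab/merges files, with a few options.
const bpeModel = bpe.fromFiles("./vocab.json", "./merges.txt", {
  dropout: 0.1,
  unkToken: "<unk>",
  endOfWordSuffix: "</w>"
});

// Load a WordPiece model from a placeholder vocab file.
const wpModel = wordPiece.fromFiles("./vocab.txt", {
  unkToken: "[UNK]",
  maxInputCharsPerWord: 100
});

// A Model is then wrapped by a Tokenizer.
const tokenizer = new Tokenizer(wpModel);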

View File

@@ -0,0 +1,12 @@
var addon = require('../../native');
module.exports = {
bpe: {
fromFiles: addon.models_BPE_from_files,
empty: addon.models_BPE_empty,
},
wordPiece: {
fromFiles: addon.models_WordPiece_from_files,
empty: addon.models_WordPiece_empty,
}
}

View File

@@ -4,68 +4,66 @@
*/
declare class Normalizer {}
export namespace normalizers {
interface BertNormalizerOptions {
/**
* Whether to clean the text, by removing any control characters
* and replacing all kinds of whitespace with the classic one.
* @default true
*/
cleanText?: boolean;
/**
* Whether to handle Chinese characters by putting spaces around them.
* @default true
*/
handleChineseChars?: boolean;
/**
* Whether to lowercase.
* @default true
*/
lowercase?: boolean;
/**
* Whether to strip all accents.
* @default true
*/
stripAccents?: boolean;
}
interface BertNormalizerOptions {
/**
* Instantiate a Bert Normalizer with the given options
*
* @param [options] Normalizer options
* @returns {Normalizer} Bert Normalizer. Takes care of normalizing raw text before giving it to a Bert model.
* This includes cleaning the text, handling accents, Chinese characters, and lowercasing
* Whether to clean the text, by removing any control characters
* and replacing all kinds of whitespace with the classic one.
* @default true
*/
export function bertNormalizer(options?: BertNormalizerOptions): Normalizer;
cleanText?: boolean;
/**
* Returns a new NFD Unicode Normalizer
* Whether to handle Chinese characters by putting spaces around them.
* @default true
*/
export function nfd(): Normalizer;
handleChineseChars?: boolean;
/**
* Returns a new NFKD Unicode Normalizer
* Whether to lowercase.
* @default true
*/
export function nfkd(): Normalizer;
lowercase?: boolean;
/**
* Returns a new NFC Unicode Normalizer
* Whether to strip all accents.
* @default true
*/
export function nfc(): Normalizer;
/**
* Returns a new NFKC Unicode Normalizer
*/
export function nfkc(): Normalizer;
/**
* Instantiate a new Normalization Sequence using the given normalizers
* @param normalizers A list of Normalizer to be run as a sequence
*/
export function sequence(normalizers: Normalizer[]): Normalizer;
/**
* Returns a new Lowercase Normalizer
*/
export function lowercase(): Normalizer;
stripAccents?: boolean;
}
/**
* Instantiate a Bert Normalizer with the given options
*
* @param [options] Normalizer options
* @returns {Normalizer} Bert Normalizer. Takes care of normalizing raw text before giving it to a Bert model.
* This includes cleaning the text, handling accents, Chinese characters, and lowercasing
*/
export function bertNormalizer(options?: BertNormalizerOptions): Normalizer;
/**
* Returns a new NFD Unicode Normalizer
*/
export function nfdNormalizer(): Normalizer;
/**
* Returns a new NFKD Unicode Normalizer
*/
export function nfkdNormalizer(): Normalizer;
/**
* Returns a new NFC Unicode Normalizer
*/
export function nfcNormalizer(): Normalizer;
/**
* Returns a new NFKC Unicode Normalizer
*/
export function nfkcNormalizer(): Normalizer;
/**
* Instantiate a new Normalization Sequence using the given normalizers
* @param normalizers A list of Normalizer to be run as a sequence
*/
export function sequenceNormalizer(normalizers: Normalizer[]): Normalizer;
/**
* Returns a new Lowercase Normalizer
*/
export function lowercaseNormalizer(): Normalizer;
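A short sketch of the renamed normalizer factories declared above, with an illustrative import path:

import {
  bertNormalizer,
  nfkcNormalizer,
  lowercaseNormalizer,
  sequenceNormalizer
} from "../bindings/normalizers";

// A Bert-style normalizer configured through BertNormalizerOptions.
const bert = bertNormalizer({ lowercase: true, stripAccents: true });

// Normalizers can also be chained: NFKC first, then lowercasing.
const chained = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);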

View File

@@ -0,0 +1,11 @@
var addon = require('../../native');
module.exports = {
bertNormalizer: addon.normalizers_BertNormalizer,
nfdNormalizer: addon.normalizers_NFD,
nfkdNormalizer: addon.normalizers_NFKD,
nfcNormalizer: addon.normalizers_NFC,
nfkcNormalizer: addon.normalizers_NFKC,
sequenceNormalizer: addon.normalizers_Sequence,
lowercaseNormalizer: addon.normalizers_Lowercase
};

View File

@@ -4,12 +4,10 @@
*/
declare class PostProcessor {}
export namespace postProcessors {
/**
* Instantiate a new BertProcessing with the given tokens
*
* @param {[string, number]} sep A tuple with the string representation of the SEP token, and its id
* @param {[string, number]} cls A tuple with the string representation of the CLS token, and its id
*/
export function BertProcessing(sep: [string, number], cls: [string, number]): PostProcessor;
}
/**
* Instantiate a new BertProcessing with the given tokens
*
* @param {[string, number]} sep A tuple with the string representation of the SEP token, and its id
* @param {[string, number]} cls A tuple with the string representation of the CLS token, and its id
*/
export function bertProcessing(sep: [string, number], cls: [string, number]): PostProcessor;
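A short sketch of `bertProcessing`; the token ids below are placeholders that would normally come from `tokenizer.tokenToId`:

import { bertProcessing } from "../bindings/post-processors";

// Placeholder ids; in practice use tokenizer.tokenToId("[SEP]") and
// tokenizer.tokenToId("[CLS]") against the loaded vocabulary.
const processor = bertProcessing(["[SEP]", 102], ["[CLS]", 101]);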

View File

@@ -0,0 +1,5 @@
var addon = require('../../native');
module.exports = {
bertProcessing: addon.processors_BertProcessing
};

View File

@@ -4,50 +4,46 @@
*/
declare class PreTokenizer {}
export namespace preTokenizers {
export namespace byteLevel {
/**
* Instantiate a new ByteLevel PreTokenizer
*
* @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
* This lets us treat `hello` exactly like `say hello`.
* @returns {PreTokenizer} ByteLevel PreTokenizer.
* This pre-tokenizer takes care of replacing all bytes of the given string
* with a corresponding representation, as well as splitting into words.
*/
export function byteLevel(addPrefixSpace?: boolean): PreTokenizer;
/**
* Returns the alphabet used by the ByteLevel PreTokenizer.
* Since the ByteLevel works as its name suggests, at the byte level, it
* encodes any byte to one visible character. This means that there is a
* total of 256 different characters composing this alphabet.
*/
export function byteLevelAlphabet(): string[];
}
/**
* Instantiate a new ByteLevel PreTokenizer
*
* @param {boolean} [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
* This lets us treat `hello` exactly like `say hello`.
* @returns {PreTokenizer} ByteLevel PreTokenizer.
* This pre-tokenizer takes care of replacing all bytes of the given string
* with a corresponding representation, as well as splitting into words.
*/
export function byteLevelPreTokenizer(addPrefixSpace?: boolean): PreTokenizer;
/**
* Returns a Whitespace PreTokenizer
* This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
*/
export function whitespace(): PreTokenizer;
/**
* Returns the alphabet used by the ByteLevel PreTokenizer.
* Since the ByteLevel works as its name suggests, at the byte level, it
* encodes any byte to one visible character. This means that there is a
* total of 256 different characters composing this alphabet.
*/
export function byteLevelAlphabet(): string[];
/**
* Returns a new Bert PreTokenizer.
* This pre-tokenizer splits tokens on spaces, and also on punctuation.
* Each occurrence of a punctuation character will be treated separately.
*/
export function bertPreTokenizer(): PreTokenizer;
/**
* Returns a Whitespace PreTokenizer
* This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
*/
export function whitespacePreTokenizer(): PreTokenizer;
/**
* Returns a new Metaspace Tokenizer.
* This pre-tokenizer replaces any whitespace by the provided replacement character.
* It then tries to split on these spaces.
*
* @param {string} [replacement="▁"] The replacement character. Must be exactly one character.
* By default we use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
* @param {boolean} [addPrefixSpace] Whether to add a space to the first word if there isn't already one.
* This lets us treat `hello` exactly like `say hello`.
*/
export function metaspace(replacement?: string, addPrefixSpace?: boolean): PreTokenizer;
}
/**
* Returns a new Bert PreTokenizer.
* This pre-tokenizer splits tokens on spaces, and also on punctuation.
* Each occurrence of a punctuation character will be treated separately.
*/
export function bertPreTokenizer(): PreTokenizer;
/**
* Returns a new Metaspace Tokenizer.
* This pre-tokenizer replaces any whitespace by the provided replacement character.
* It then tries to split on these spaces.
*
* @param {string} [replacement="▁"] The replacement character. Must be exactly one character.
* By default we use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
* @param {boolean} [addPrefixSpace] Whether to add a space to the first word if there isn't already one.
* This lets us treat `hello` exactly like `say hello`.
*/
export function metaspacePreTokenizer(replacement?: string, addPrefixSpace?: boolean): PreTokenizer;
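A short sketch of the renamed pre-tokenizer factories declared above, with an illustrative import path:

import {
  byteLevelPreTokenizer,
  byteLevelAlphabet,
  whitespacePreTokenizer,
  bertPreTokenizer,
  metaspacePreTokenizer
} from "../bindings/pre-tokenizers";

const byteLevel = byteLevelPreTokenizer(true);       // add a prefix space
const alphabet = byteLevelAlphabet();                // the 256 visible characters
const whitespace = whitespacePreTokenizer();         // splits on \w+|[^\w\s]+
const bert = bertPreTokenizer();                     // splits on spaces and punctuation
const metaspace = metaspacePreTokenizer("▁", true);  // SentencePiece-style splitting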

View File

@@ -0,0 +1,9 @@
var addon = require('../../native');
module.exports = {
byteLevelPreTokenizer: addon.pre_tokenizers_ByteLevel,
byteLevelAlphabet: addon.pre_tokenizers_ByteLevel_Alphabet,
whitespacePreTokenizer: addon.pre_tokenizers_Whitespace,
bertPreTokenizer: addon.pre_tokenizers_BertPreTokenizer,
metaspacePreTokenizer: addon.pre_tokenizers_Metaspace
};

View File

@@ -0,0 +1,3 @@
var addon = require('../../native');
module.exports.Tokenizer = addon.tokenizer_Tokenizer;

View File

@@ -4,59 +4,57 @@
*/
declare class Trainer {}
export namespace trainers {
interface TrainerOptions {
/**
* A prefix to be used for every subword that is not a beginning-of-word.
*/
continuingSubwordPrefix?: string;
/**
* A suffix to be used for every subword that is an end-of-word.
*/
endOfWordSuffix?: string;
/**
* A list of characters to include in the initial alphabet, even
* if not seen in the training dataset.
* If a string contains more than one character, only the first one
* is kept.
* @default []
*/
initialAlphabet?: string[];
/**
* The maximum number of different characters to keep in the alphabet.
*/
limitAlphabet?: number;
/**
* The minimum frequency a pair should have in order to be merged.
* @default 2
*/
minFrequency?: number;
/**
* Whether to show progress bars while training.
* @default true
*/
showProgress?: boolean;
/**
* A list of special tokens the model should know of.
* @default []
*/
specialTokens?: string[];
/**
* The size of the final vocabulary, including all tokens and alphabet.
* @default 30000
*/
vocabSize?: number;
}
interface TrainerOptions {
/**
* Instantiate a new BPE Trainer
* @param {TrainerOptions} [options] BPE Trainer options
* A prefix to be used for every subword that is not a beginning-of-word.
*/
export function bpeTrainer(options?: TrainerOptions): Trainer;
continuingSubwordPrefix?: string;
/**
* Instantiate a new WordPiece Trainer
* @param {TrainerOptions} [options] WordPiece Trainer options
* A suffix to be used for every subword that is an end-of-word.
*/
export function wordPieceTrainer(options?: TrainerOptions): Trainer;
endOfWordSuffix?: string;
/**
* A list of characters to include in the initial alphabet, even
* if not seen in the training dataset.
* If a string contains more than one character, only the first one
* is kept.
* @default []
*/
initialAlphabet?: string[];
/**
* The maximum number of different characters to keep in the alphabet.
*/
limitAlphabet?: number;
/**
* The minimum frequency a pair should have in order to be merged.
* @default 2
*/
minFrequency?: number;
/**
* Whether to show progress bars while training.
* @default true
*/
showProgress?: boolean;
/**
* A list of special tokens the model should know of.
* @default []
*/
specialTokens?: string[];
/**
* The size of the final vocabulary, including all tokens and alphabet.
* @default 30000
*/
vocabSize?: number;
}
/**
* Instantiate a new BPE Trainer
* @param {TrainerOptions} [options] BPE Trainer options
*/
export function bpeTrainer(options?: TrainerOptions): Trainer;
/**
* Instantiate a new WordPiece Trainer
* @param {TrainerOptions} [options] WordPiece Trainer options
*/
export function wordPieceTrainer(options?: TrainerOptions): Trainer;
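A short sketch of the trainer factories declared above; the option values are illustrative:

import { bpeTrainer, wordPieceTrainer } from "../bindings/trainers";

// Both factories take the same TrainerOptions shape.
const trainer = wordPieceTrainer({
  vocabSize: 30000,
  minFrequency: 2,
  specialTokens: ["[UNK]", "[SEP]", "[CLS]"],
  continuingSubwordPrefix: "##"
});
// A trainer is then consumed by tokenizer.train(trainer, files).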

View File

@@ -0,0 +1,6 @@
var addon = require('../../native');
module.exports = {
bpeTrainer: addon.trainers_BPETrainer,
wordPieceTrainer: addon.trainers_WordPieceTrainer
};

View File

@@ -2,7 +2,7 @@ import { promisify } from "util";
import { Encoding, Tokenizer } from "../bindings/tokenizer";
export class BaseTokenizer {
constructor(private tokenizer: Tokenizer) {}
constructor(protected tokenizer: Tokenizer) {}
/**
* Encode the given sequence
@@ -11,7 +11,7 @@ export class BaseTokenizer {
* @param {(string | null)} pair The optional pair sequence
*/
async encode(sequence: string, pair?: string): Promise<Encoding> {
const encode = promisify(this.tokenizer.encode);
const encode = promisify(this.tokenizer.encode.bind(this.tokenizer));
return encode(sequence, pair ?? null);
}
@@ -22,7 +22,7 @@ export class BaseTokenizer {
* The list can contain both at the same time.
*/
async encodeBatch(sequences: (string | [string, string])[]): Promise<Encoding[]> {
const encodeBatch = promisify(this.tokenizer.encodeBatch);
const encodeBatch = promisify(this.tokenizer.encodeBatch.bind(this.tokenizer));
return encodeBatch(sequences);
}
}
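The two `.bind(this.tokenizer)` additions matter because `promisify` returns a detached function: without binding, the native method would be invoked with `this` undefined and fail at call time. A minimal standalone illustration (the class and names are hypothetical, not part of this library):

import { promisify } from "util";

class Native {
  private label = "native";
  greet(callback: (err: Error | null, result: string) => void): void {
    // Needs `this` to be the Native instance for `this.label` to exist.
    callback(null, this.label);
  }
}

const native = new Native();

// promisify(native.greet) would detach the method from its instance, so
// `this.label` would fail when called; binding first keeps it working.
const greet = promisify(native.greet.bind(native));
greet().then(console.log); // "native"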

View File

@@ -1,9 +1,14 @@
import { promisify } from "util";
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, models } from "../bindings/models";
import { Model, wordPiece } from "../bindings/models";
import { bertNormalizer } from "../bindings/normalizers";
import { bertPreTokenizer } from "../bindings/pre-tokenizers";
import { bertProcessing } from "../bindings/post-processors";
import { wordPieceDecoder } from "../bindings/decoders";
import { wordPieceTrainer } from "../bindings/trainers";
interface BertWordpieceOptions {
export interface BertWordPieceOptions {
/**
* @default true
*/
@@ -43,42 +48,121 @@ interface BertWordpieceOptions {
wordpiecesPrefix?: string;
}
const defaultBertOptions: Required<Omit<BertWordpieceOptions, 'vocabFile'>> & { vocabFile?: string } = {
addSpecialTokens: true,
cleanText: true,
clsToken: '[CLS]',
handleChineseChars: true,
lowercase: true,
sepToken: '[SEP]',
stripAccents: true,
unkToken: '[UNK]',
wordpiecesPrefix: '##'
};
/**
* Instantiate and returns a new Bert WordPiece tokenizer
* @param options
*/
export async function getBertWordpieceTokenizer(options?: BertWordpieceOptions): Promise<BertWordpieceTokenizer> {
const mergedOptions = { ...defaultBertOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile) {
const fromFiles = promisify(models.WordPiece.fromFiles);
model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
} else {
model = models.WordPiece.empty();
}
const tokenizer = new Tokenizer(model);
return new BertWordpieceTokenizer(tokenizer);
export interface BertWordPieceTrainOptions {
/**
* @default []
*/
initialAlphabet?: string[];
/**
* @default 1000
*/
limitAlphabet?: number;
/**
* @default 2
*/
minFrequency?: number;
/**
* @default true
*/
showProgress?: boolean;
/**
* @default ["[UNK]", "[SEP]", "[CLS]"]
*/
specialTokens?: string[];
/**
* @default 30000
*/
vocabSize?: number;
/**
* @default "##"
*/
wordpiecesPrefix?: string;
}
/**
* Bert WordPiece Tokenizer
*/
class BertWordpieceTokenizer extends BaseTokenizer {
constructor(tokenizer: Tokenizer) {
export class BertWordPieceTokenizer extends BaseTokenizer {
private static readonly defaultBertOptions:
Required<Omit<BertWordPieceOptions, "vocabFile">> & { vocabFile?: string } = {
addSpecialTokens: true,
cleanText: true,
clsToken: "[CLS]",
handleChineseChars: true,
lowercase: true,
sepToken: "[SEP]",
stripAccents: true,
unkToken: "[UNK]",
wordpiecesPrefix: "##"
};
private readonly defaultTrainOptions: Required<BertWordPieceTrainOptions> = {
initialAlphabet: [],
limitAlphabet: 1000,
minFrequency: 2,
showProgress: true,
specialTokens: ["[UNK]", "[SEP]", "[CLS]"],
vocabSize: 30000,
wordpiecesPrefix: "##"
};
private constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
/**
* Instantiate and returns a new Bert WordPiece tokenizer
* @param [options] Optional tokenizer options
*/
static async fromOptions(options?: BertWordPieceOptions): Promise<BertWordPieceTokenizer> {
const mergedOptions = { ...this.defaultBertOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile) {
// const fromFiles = promisify(WordPiece.fromFiles);
model = wordPiece.fromFiles(mergedOptions.vocabFile, { unkToken: mergedOptions.unkToken });
// model = await fromFiles(mergedOptions.vocabFile, mergedOptions.unkToken, null);
} else {
model = wordPiece.empty();
}
const tokenizer = new Tokenizer(model);
const normalizer = bertNormalizer(mergedOptions);
tokenizer.setNormalizer(normalizer);
tokenizer.setPreTokenizer(bertPreTokenizer());
const sepTokenId = tokenizer.tokenToId(mergedOptions.sepToken);
if (sepTokenId === undefined) {
throw new Error("sepToken not found in the vocabulary");
}
const clsTokenId = tokenizer.tokenToId(mergedOptions.clsToken);
if (clsTokenId === undefined) {
throw new Error("clsToken not found in the vocabulary");
}
if (mergedOptions.addSpecialTokens) {
const processor = bertProcessing([mergedOptions.sepToken, sepTokenId], [mergedOptions.clsToken, clsTokenId]);
tokenizer.setPostProcessor(processor);
}
const decoder = wordPieceDecoder(mergedOptions.wordpiecesPrefix);
tokenizer.setDecoder(decoder);
return new BertWordPieceTokenizer(tokenizer);
}
/**
* Train the model using the given files
*
* @param files Files to use for training
* @param [options] Training options
*/
async train(files: string[], options?: BertWordPieceTrainOptions): Promise<void> {
const mergedOptions = { ...this.defaultTrainOptions, ...options };
const trainer = wordPieceTrainer(mergedOptions);
this.tokenizer.train(trainer, files);
}
}
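A minimal usage sketch of the new `BertWordPieceTokenizer` API; the vocabulary path is a placeholder:

import { BertWordPieceTokenizer } from "./bert-wordpiece.tokenizer";

async function example(): Promise<void> {
  // "./vocab.txt" is a placeholder path to a WordPiece vocabulary.
  const tokenizer = await BertWordPieceTokenizer.fromOptions({ vocabFile: "./vocab.txt" });
  const encoding = await tokenizer.encode("Hello there!", "How are you?");
  console.log(encoding); // Encoding produced for the sentence pair
}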

View File

@@ -1,52 +1,126 @@
import { promisify } from "util";
import { BaseTokenizer } from "./base.tokenizer";
import { Model, models } from "../bindings/models";
import { Model, bpe } from "../bindings/models";
import { Tokenizer } from "../bindings/tokenizer";
import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
import { whitespacePreTokenizer } from "../bindings/pre-tokenizers";
import { bpeDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";
interface BPEOptions {
export interface BPETokenizerOptions {
dropout?: number;
mergesFile?: string;
/**
* @default "</w>"
*/
suffix?: string;
/**
* @default "<unk>"
*/
unkToken?: string;
vocabFile?: string;
}
const defaultBPEOptions: BPEOptions & Required<Pick<BPEOptions, 'unkToken' | 'suffix'>> = {
suffix: '</w>',
unkToken: '<unk>'
};
/**
* Instantiate and returns a new BPE tokenizer
* @param options
*/
export async function getBPETokenizer(options?: BPEOptions): Promise<BPETokenizer> {
const mergedOptions = { ...defaultBPEOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
const fromFiles = promisify(models.BPE.fromFiles);
const modelOptions: models.BPE.BPEOptions = {
dropout: mergedOptions.dropout,
endOfWordSuffix: mergedOptions.suffix,
unkToken: mergedOptions.unkToken
};
model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
} else {
model = models.BPE.empty();
}
const tokenizer = new Tokenizer(model);
return new BPETokenizer(tokenizer);
export interface BPETokenizerTrainOptions {
/**
* @default []
*/
initialAlphabet?: string[];
/**
* @default 1000
*/
limitAlphabet?: number;
/**
* @default 2
*/
minFrequency?: number;
/**
* @default true
*/
showProgress?: boolean;
/**
* @default ["<unk>"]
*/
specialTokens?: string[];
/**
* @default "</w>"
*/
suffix?: string;
/**
* @default 30000
*/
vocabSize?: number;
}
/**
* Original BPE Tokenizer.
* Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
*/
class BPETokenizer extends BaseTokenizer {
constructor(tokenizer: Tokenizer) {
export class BPETokenizer extends BaseTokenizer {
private static readonly defaultBPEOptions:
BPETokenizerOptions & Required<Pick<BPETokenizerOptions, "unkToken" | "suffix">> = {
suffix: "</w>",
unkToken: "<unk>"
};
private readonly defaultTrainOptions: Required<BPETokenizerTrainOptions> = {
initialAlphabet: [],
limitAlphabet: 1000,
minFrequency: 2,
showProgress: true,
specialTokens: ["<unk>"],
suffix: "</w>",
vocabSize: 30000
};
private constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
/**
* Instantiate and returns a new BPE tokenizer
* @param [options] Optional tokenizer options
*/
static async fromOptions(options?: BPETokenizerOptions): Promise<BPETokenizer> {
const mergedOptions = { ...this.defaultBPEOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
// const fromFiles = promisify(BPE.fromFiles);
const modelOptions: bpe.BPEModelOptions = {
dropout: mergedOptions.dropout,
endOfWordSuffix: mergedOptions.suffix,
unkToken: mergedOptions.unkToken
};
model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
// model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
} else {
model = bpe.empty();
}
const tokenizer = new Tokenizer(model);
const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);
tokenizer.setNormalizer(normalizer);
tokenizer.setPreTokenizer(whitespacePreTokenizer());
const decoder = bpeDecoder(mergedOptions.suffix);
tokenizer.setDecoder(decoder);
return new BPETokenizer(tokenizer);
}
/**
* Train the model using the given files
*
* @param files Files to use for training
* @param [options] Training options
*/
async train(files: string[], options?: BPETokenizerTrainOptions): Promise<void> {
const mergedOptions = { ...this.defaultTrainOptions, ...options };
const trainer = bpeTrainer(mergedOptions);
this.tokenizer.train(trainer, files);
}
}
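A minimal usage sketch of the new `BPETokenizer` API, training an empty model from scratch; the corpus paths are placeholders:

import { BPETokenizer } from "./bpe.tokenizer";

async function example(): Promise<void> {
  // No vocab/merges files given, so the tokenizer starts from an empty BPE model.
  const tokenizer = await BPETokenizer.fromOptions();
  await tokenizer.train(["./corpus-a.txt", "./corpus-b.txt"], { vocabSize: 5000 });
  const encoding = await tokenizer.encode("hello world");
  console.log(encoding);
}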

View File

@@ -0,0 +1,91 @@
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, bpe } from "../bindings/models";
import { nfkcNormalizer } from "../bindings/normalizers";
import { byteLevelPreTokenizer, byteLevelAlphabet } from "../bindings/pre-tokenizers";
import { byteLevelDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";
export interface ByteLevelBPETokenizerOptions {
/**
* @default false
*/
addPrefixSpace?: boolean;
mergesFile?: string;
vocabFile?: string;
}
export interface ByteLevelBPETrainOptions {
/**
* @default 2
*/
minFrequency?: number;
/**
* @default true
*/
showProgress?: boolean;
/**
* @default []
*/
specialTokens?: string[];
/**
* @default 30000
*/
vocabSize?: number;
}
/**
* Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
*/
export class ByteLevelBPETokenizer extends BaseTokenizer {
private static readonly defaultOptions:
ByteLevelBPETokenizerOptions & Required<Pick<ByteLevelBPETokenizerOptions, 'addPrefixSpace'>> = {
addPrefixSpace: false
};
private readonly defaultTrainOptions: Required<ByteLevelBPETrainOptions> = {
minFrequency: 2,
showProgress: true,
specialTokens: ['<unk>'],
vocabSize: 30000
};
private constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
static async fromOptions(options?: ByteLevelBPETokenizerOptions): Promise<ByteLevelBPETokenizer> {
const mergedOptions = { ...this.defaultOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
// const fromFiles = promisify(BPE.fromFiles);
model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile);
// model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, null);
} else {
model = bpe.empty();
}
const tokenizer = new Tokenizer(model);
tokenizer.setNormalizer(nfkcNormalizer());
const preTokenizer = byteLevelPreTokenizer(mergedOptions.addPrefixSpace);
tokenizer.setPreTokenizer(preTokenizer);
tokenizer.setDecoder(byteLevelDecoder());
return new ByteLevelBPETokenizer(tokenizer);
}
/**
* Train the model using the given files
*
* @param files Files to use for training
* @param [options] Training options
*/
async train(files: string[], options?: ByteLevelBPETrainOptions): Promise<void> {
const mergedOptions = { ...this.defaultTrainOptions, ...options };
const trainer = bpeTrainer({ ...mergedOptions, initialAlphabet: byteLevelAlphabet() });
this.tokenizer.train(trainer, files);
}
}

View File

@@ -1,2 +1,4 @@
export * from './bert-wordpiece.tokenizer';
export * from './bpe.tokenizer';
export * from './byte-level-bpe.tokenizer';
export * from './sentence-piece.tokenizer';

View File

@@ -0,0 +1,121 @@
import { BaseTokenizer } from "./base.tokenizer";
import { Tokenizer } from "../bindings/tokenizer";
import { Model, bpe } from "../bindings/models";
import { nfkcNormalizer } from "../bindings/normalizers";
import { metaspacePreTokenizer } from "../bindings/pre-tokenizers";
import { metaspaceDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";
export interface SentencePieceTokenizerOptions extends OptionsWithDefaults {
dropout?: number;
mergesFile?: string;
vocabFile?: string;
}
interface OptionsWithDefaults {
/**
* @default true
*/
addPrefixSpace?: boolean;
/**
* @default "▁"
*/
replacement?: string;
/**
* @default "<unk>"
*/
unkToken?: string;
}
export interface SentencePieceTrainOptions {
/**
* @default []
*/
initialAlphabet?: string[];
/**
* @default 1000
*/
limitAlphabet?: number;
/**
* @default 2
*/
minFrequency?: number;
/**
* @default true
*/
showProgress?: boolean;
/**
* @default ["<unk>"]
*/
specialTokens?: string[];
/**
* @default 30000
*/
vocabSize?: number;
}
/**
* Represents the BPE algorithm, with the pretokenization used by SentencePiece
*/
export class SentencePieceTokenizer extends BaseTokenizer {
private static readonly defaultOptions: SentencePieceTokenizerOptions & Required<OptionsWithDefaults> = {
addPrefixSpace: true,
replacement: '▁',
unkToken: '<unk>'
};
private readonly defaultTrainOptions: Required<SentencePieceTrainOptions> = {
initialAlphabet: [],
limitAlphabet: 1000,
minFrequency: 2,
showProgress: true,
specialTokens: ['<unk>'],
vocabSize: 30000
};
private constructor(tokenizer: Tokenizer) {
super(tokenizer);
}
static async fromOptions(options?: SentencePieceTokenizerOptions): Promise<SentencePieceTokenizer> {
const mergedOptions = { ...this.defaultOptions, ...options };
let model: Model;
if (mergedOptions.vocabFile && mergedOptions.mergesFile) {
// const fromFiles = promisify(BPE.fromFiles);
const modelOptions: bpe.BPEModelOptions = {
dropout: mergedOptions.dropout,
unkToken: mergedOptions.unkToken
};
model = bpe.fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, modelOptions);
// model = await fromFiles(mergedOptions.vocabFile, mergedOptions.mergesFile, null);
} else {
model = bpe.empty();
}
const tokenizer = new Tokenizer(model);
tokenizer.setNormalizer(nfkcNormalizer());
const preTokenizer = metaspacePreTokenizer(mergedOptions.replacement, mergedOptions.addPrefixSpace);
tokenizer.setPreTokenizer(preTokenizer);
const decoder = metaspaceDecoder(mergedOptions.replacement, mergedOptions.addPrefixSpace);
tokenizer.setDecoder(decoder);
return new SentencePieceTokenizer(tokenizer);
}
/**
* Train the model using the given files
*
* @param files Files to use for training
* @param [options] Training options
*/
async train(files: string[], options?: SentencePieceTrainOptions): Promise<void> {
const mergedOptions = { ...this.defaultTrainOptions, ...options };
const trainer = bpeTrainer(mergedOptions);
this.tokenizer.train(trainer, files);
}
}
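A minimal usage sketch of the new `SentencePieceTokenizer` API; file paths are placeholders:

import { SentencePieceTokenizer } from "./sentence-piece.tokenizer";

async function example(): Promise<void> {
  // Placeholder paths to an existing BPE vocab/merges pair.
  const tokenizer = await SentencePieceTokenizer.fromOptions({
    vocabFile: "./vocab.json",
    mergesFile: "./merges.txt"
  });
  const encodings = await tokenizer.encodeBatch(["first sentence", "second sentence"]);
  console.log(encodings.length);
}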

View File

@@ -1,2 +1 @@
export * from './bindings';
export * from './implementations';

View File

@@ -3,5 +3,5 @@ function __export(m) {
for (var p in m) if (!exports.hasOwnProperty(p)) exports[p] = m[p];
}
Object.defineProperty(exports, "__esModule", { value: true });
__export(require("./bindings"));
// export * from './bindings';
__export(require("./implementations"));

View File

@@ -1,2 +1,2 @@
export * from './bindings';
// export * from './bindings';
export * from './implementations';