node: expose Encoding

2025-08-22 16:25:30 +00:00 · 2020-02-03 11:30:50 -05:00
parent 27880b3aaf
commit 19878e7584
6 changed files with 98 additions and 93 deletions
--- a/bindings/node/lib/bindings/encoding.d.ts
+++ b/bindings/node/lib/bindings/encoding.d.ts
@ -0,0 +1,89 @@
+/**
+ * An Encoding as returned by the Tokenizer
+ */
+export interface Encoding {
+  /**
+   * Returns the attention mask
+   */
+  getAttentionMask(): number[];
+
+  /**
+   * Returns the tokenized ids
+   */
+  getIds(): number[];
+
+  /**
+   * Returns the offsets, as `[start, end]` pairs
+   */
+  getOffsets(): [number, number][];
+
+  /**
+   * Returns the overflowing encoding produced by truncation, if any
+   */
+  getOverflowing(): Encoding | undefined;
+
+  /**
+   * Returns the special tokens mask (an array, like the attention mask)
+   */
+  getSpecialTokensMask(): number[];
+
+  /**
+   * Returns the tokens of the tokenized string
+   */
+  getTokens(): string[];
+
+  /**
+   * Returns the type ids
+   */
+  getTypeIds(): number[];
+
+  /**
+   * Returns the original string
+   *
+   * @param [begin] The index from which to start (can be negative).
+   * @param [end] The index (excluded) to which to stop (can be negative).
+   * Stopping at the end of the string if not provided.
+   * @returns The full original string if no parameter is provided,
+   * otherwise the original string between `begin` and `end`
+   */
+  getOriginalString(begin?: number, end?: number): string;
+
+  /**
+   * Pad the current Encoding to the given length
+   *
+   * @param length The length to pad to
+   * @param [options] Padding options
+   */
+  pad(length: number, options?: PaddingOptions): void;
+
+  /**
+   * Truncate the current Encoding to the given maximum length
+   *
+   * @param length The maximum length to be kept
+   * @param [stride=0] The length of the previous first sequence
+   * to be included in the overflowing sequence
+   */
+  truncate(length: number, stride?: number): void;
+}
+
+interface PaddingOptions {
+  /** The direction in which to pad
+   * @default "right"
+   */
+  direction?: "left" | "right";
+  /**
+   * The index to be used when padding
+   * @default 0
+   */
+  padId?: number;
+  /**
+   * The type index to be used when padding
+   * @default 0
+   */
+  padTypeId?: number;
+  /**
+   * The pad token to be used when padding
+   * @default "[PAD]"
+   */
+  padToken?: string;
+}
--- a/bindings/node/lib/bindings/encoding.test.ts
+++ b/bindings/node/lib/bindings/encoding.test.ts
@ -1,7 +1,8 @@
 import { promisify } from "util";

+import { Encoding } from "./encoding";
 import { BPE } from "./models";
-import { Encoding, Tokenizer } from "./tokenizer";
+import { Tokenizer } from "./tokenizer";

 describe("Encoding", () => {
  const originalString = "my name is john";
--- a/bindings/node/lib/bindings/tokenizer.d.ts
+++ b/bindings/node/lib/bindings/tokenizer.d.ts
@ -1,4 +1,5 @@
 import { Decoder } from "./decoders";
+import { Encoding } from "./encoding";
 import { Model } from "./models";
 import { Normalizer } from "./normalizers";
 import { PostProcessor } from "./post-processors";
@ -187,93 +188,3 @@ export class Tokenizer {
   */
  setDecoder(decoder: Decoder): void;
 }
-
-/**
- * An Encoding as returned by the Tokenizer
- */
-interface Encoding {
-  /**
-   * Returns the attention mask
-   */
-  getAttentionMask(): number[];
-
-  /**
-   * Returns the tokenized ids
-   */
-  getIds(): number[];
-
-  /**
-   * Returns the offsets
-   */
-  getOffsets(): [number, number][];
-
-  /**
-   * Returns the overflowing encoding, after truncation
-   */
-  getOverflowing(): Encoding | undefined;
-
-  /**
-   * Returns the special tokens mask
-   */
-  getSpecialTokensMask(): number;
-
-  /**
-   * Returns the tokenized string
-   */
-  getTokens(): string[];
-
-  /**
-   * Returns the type ids
-   */
-  getTypeIds(): number[];
-
-  /**
-   * Returns the original string
-   *
-   * @param [begin] The index from which to start (can be negative).
-   * @param [end] The index (excluded) to which to stop (can be negative).
-   * Stopping at the end of the string if not provided.
-   * @returns The full original string if no parameter is provided,
-   * otherwise the original string between `begin` and `end`
-   */
-  getOriginalString(begin?: number, end?: number): string;
-
-  /**
-   * Pad the current Encoding at the given length
-   *
-   * @param length The length at which to pad
-   * @param [options] Padding options
-   */
-  pad(length: number, options?: PaddingOptions): void;
-
-  /**
-   * Truncate the current Encoding at the given max_length
-   *
-   * @param length The maximum length to be kept
-   * @param [stride=0] The length of the previous first sequence
-   * to be included in the overflowing sequence
-   */
-  truncate(length: number, stride?: number): void;
-}
-
-interface PaddingOptions {
-  /**
-   * @default "right"
-   */
-  direction?: "left" | "right";
-  /**
-   * The index to be used when padding
-   * @default 0
-   */
-  padId?: number;
-  /**
-   * The type index to be used when padding
-   * @default 0
-   */
-  padTypeId?: number;
-  /**
-   * The pad token to be used when padding
-   * @default "[PAD]"
-   */
-  padToken?: string;
-}
--- a/bindings/node/lib/index.ts
+++ b/bindings/node/lib/index.ts
@ -1,2 +1,2 @@
-// export * from './bindings';
+// export * from "./bindings";
 export * from "./tokenizers";
--- a/bindings/node/lib/tokenizers/base.tokenizer.ts
+++ b/bindings/node/lib/tokenizers/base.tokenizer.ts
@ -1,6 +1,9 @@
 import { promisify } from "util";

-import { Encoding, Tokenizer } from "../bindings/tokenizer";
+import { Encoding } from "../bindings/encoding";
+import { Tokenizer } from "../bindings/tokenizer";
+
+export { Encoding };

 export class BaseTokenizer {
  constructor(protected tokenizer: Tokenizer) {}
--- a/bindings/node/lib/tokenizers/index.ts
+++ b/bindings/node/lib/tokenizers/index.ts
@ -1,3 +1,4 @@
+export { Encoding } from "./base.tokenizer";
 export * from "./bert-wordpiece.tokenizer";
 export * from "./bpe.tokenizer";
 export * from "./byte-level-bpe.tokenizer";