node: expose Encoding

This commit is contained in:
Pierric Cistac
2020-02-03 11:30:50 -05:00
parent 27880b3aaf
commit 19878e7584
6 changed files with 98 additions and 93 deletions

View File

@ -0,0 +1,89 @@
/**
 * An Encoding as returned by the Tokenizer
 */
export interface Encoding {
  /**
   * Returns the attention mask
   */
  getAttentionMask(): number[];

  /**
   * Returns the tokenized ids
   */
  getIds(): number[];

  /**
   * Returns the offsets, as `[start, end]` pairs
   */
  getOffsets(): [number, number][];

  /**
   * Returns the overflowing encoding, after truncation
   *
   * @returns The overflowing Encoding, or `undefined` when there is none
   */
  getOverflowing(): Encoding | undefined;

  /**
   * Returns the special tokens mask
   *
   * Typed as an array, like the other mask / id accessors of this interface
   * (it was previously declared as a single `number`).
   */
  getSpecialTokensMask(): number[];

  /**
   * Returns the tokenized string
   */
  getTokens(): string[];

  /**
   * Returns the type ids
   */
  getTypeIds(): number[];

  /**
   * Returns the original string
   *
   * @param [begin] The index from which to start (can be negative).
   * @param [end] The index (excluded) to which to stop (can be negative).
   * Stopping at the end of the string if not provided.
   * @returns The full original string if no parameter is provided,
   * otherwise the original string between `begin` and `end`
   */
  getOriginalString(begin?: number, end?: number): string;

  /**
   * Pad the current Encoding at the given length
   *
   * @param length The length at which to pad
   * @param [options] Padding options
   */
  pad(length: number, options?: PaddingOptions): void;

  /**
   * Truncate the current Encoding at the given max_length
   *
   * @param length The maximum length to be kept
   * @param [stride=0] The length of the previous first sequence
   * to be included in the overflowing sequence
   */
  truncate(length: number, stride?: number): void;
}
/**
 * Options accepted by `pad`; every field is optional
 */
interface PaddingOptions {
  /**
   * The padding direction
   * @default "right"
   */
  direction?: "right" | "left";

  /**
   * The index to be used when padding
   * @default 0
   */
  padId?: number;

  /**
   * The type index to be used when padding
   * @default 0
   */
  padTypeId?: number;

  /**
   * The pad token to be used when padding
   * @default "[PAD]"
   */
  padToken?: string;
}

View File

@ -1,7 +1,8 @@
import { promisify } from "util";
import { Encoding } from "./encoding";
import { BPE } from "./models";
import { Encoding, Tokenizer } from "./tokenizer";
import { Tokenizer } from "./tokenizer";
describe("Encoding", () => {
const originalString = "my name is john";

View File

@ -1,4 +1,5 @@
import { Decoder } from "./decoders";
import { Encoding } from "./encoding";
import { Model } from "./models";
import { Normalizer } from "./normalizers";
import { PostProcessor } from "./post-processors";
@ -187,93 +188,3 @@ export class Tokenizer {
*/
setDecoder(decoder: Decoder): void;
}
/**
 * An Encoding as returned by the Tokenizer
 */
interface Encoding {
  /**
   * Returns the attention mask
   */
  getAttentionMask(): number[];

  /**
   * Returns the tokenized ids
   */
  getIds(): number[];

  /**
   * Returns the offsets, as `[start, end]` pairs
   */
  getOffsets(): [number, number][];

  /**
   * Returns the overflowing encoding, after truncation
   *
   * @returns The overflowing Encoding, or `undefined` when there is none
   */
  getOverflowing(): Encoding | undefined;

  /**
   * Returns the special tokens mask
   *
   * Typed as an array, like the other mask / id accessors of this interface
   * (it was previously declared as a single `number`).
   */
  getSpecialTokensMask(): number[];

  /**
   * Returns the tokenized string
   */
  getTokens(): string[];

  /**
   * Returns the type ids
   */
  getTypeIds(): number[];

  /**
   * Returns the original string
   *
   * @param [begin] The index from which to start (can be negative).
   * @param [end] The index (excluded) to which to stop (can be negative).
   * Stopping at the end of the string if not provided.
   * @returns The full original string if no parameter is provided,
   * otherwise the original string between `begin` and `end`
   */
  getOriginalString(begin?: number, end?: number): string;

  /**
   * Pad the current Encoding at the given length
   *
   * @param length The length at which to pad
   * @param [options] Padding options
   */
  pad(length: number, options?: PaddingOptions): void;

  /**
   * Truncate the current Encoding at the given max_length
   *
   * @param length The maximum length to be kept
   * @param [stride=0] The length of the previous first sequence
   * to be included in the overflowing sequence
   */
  truncate(length: number, stride?: number): void;
}
/**
 * Options accepted by `pad`; every field is optional
 */
interface PaddingOptions {
  /**
   * The padding direction
   * @default "right"
   */
  direction?: "right" | "left";

  /**
   * The index to be used when padding
   * @default 0
   */
  padId?: number;

  /**
   * The type index to be used when padding
   * @default 0
   */
  padTypeId?: number;

  /**
   * The pad token to be used when padding
   * @default "[PAD]"
   */
  padToken?: string;
}

View File

@ -1,2 +1,2 @@
// export * from './bindings';
// export * from "./bindings";
export * from "./tokenizers";

View File

@ -1,6 +1,9 @@
import { promisify } from "util";
import { Encoding, Tokenizer } from "../bindings/tokenizer";
import { Encoding } from "../bindings/encoding";
import { Tokenizer } from "../bindings/tokenizer";
export { Encoding };
export class BaseTokenizer {
constructor(protected tokenizer: Tokenizer) {}

View File

@ -1,3 +1,4 @@
export { Encoding } from "./base.tokenizer";
export * from "./bert-wordpiece.tokenizer";
export * from "./bpe.tokenizer";
export * from "./byte-level-bpe.tokenizer";