mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
node: expose Encoding
This commit is contained in:
89
bindings/node/lib/bindings/encoding.d.ts
vendored
Normal file
89
bindings/node/lib/bindings/encoding.d.ts
vendored
Normal file
@ -0,0 +1,89 @@
|
||||
/**
 * An Encoding as returned by the Tokenizer.
 *
 * Exposes read accessors for the tokenization result, plus `pad` and
 * `truncate`, which operate on the current Encoding and return nothing.
 */
export interface Encoding {
  /**
   * Returns the attention mask
   */
  getAttentionMask(): number[];

  /**
   * Returns the tokenized ids
   */
  getIds(): number[];

  /**
   * Returns the offsets, as a list of `[number, number]` pairs
   */
  getOffsets(): [number, number][];

  /**
   * Returns the overflowing encoding, after truncation
   *
   * @returns The overflowing Encoding, or `undefined` if none is available
   */
  getOverflowing(): Encoding | undefined;

  /**
   * Returns the special tokens mask
   *
   * Declared as an array, like the attention mask: a mask is a sequence
   * of numbers, not a single number.
   */
  getSpecialTokensMask(): number[];

  /**
   * Returns the tokenized string
   */
  getTokens(): string[];

  /**
   * Returns the type ids
   */
  getTypeIds(): number[];

  /**
   * Returns the original string
   *
   * @param [begin] The index from which to start (can be negative).
   * @param [end] The index (excluded) to which to stop (can be negative).
   * Stopping at the end of the string if not provided.
   * @returns The full original string if no parameter is provided,
   * otherwise the original string between `begin` and `end`
   */
  getOriginalString(begin?: number, end?: number): string;

  /**
   * Pad the current Encoding at the given length
   *
   * @param length The length at which to pad
   * @param [options] Padding options
   */
  pad(length: number, options?: PaddingOptions): void;

  /**
   * Truncate the current Encoding at the given max_length
   *
   * @param length The maximum length to be kept
   * @param [stride=0] The length of the previous first sequence
   * to be included in the overflowing sequence
   */
  truncate(length: number, stride?: number): void;
}
|
||||
|
||||
/**
 * Optional settings accepted by `Encoding.pad`.
 * Every field may be omitted; the documented default then applies.
 */
interface PaddingOptions {
  /**
   * The pad token to be used when padding
   * @default "[PAD]"
   */
  padToken?: string;

  /**
   * The index to be used when padding
   * @default 0
   */
  padId?: number;

  /**
   * The type index to be used when padding
   * @default 0
   */
  padTypeId?: number;

  /**
   * The padding direction
   * @default "right"
   */
  direction?: "left" | "right";
}
|
@ -1,7 +1,8 @@
|
||||
import { promisify } from "util";
|
||||
|
||||
import { Encoding } from "./encoding";
|
||||
import { BPE } from "./models";
|
||||
import { Encoding, Tokenizer } from "./tokenizer";
|
||||
import { Tokenizer } from "./tokenizer";
|
||||
|
||||
describe("Encoding", () => {
|
||||
const originalString = "my name is john";
|
||||
|
91
bindings/node/lib/bindings/tokenizer.d.ts
vendored
91
bindings/node/lib/bindings/tokenizer.d.ts
vendored
@ -1,4 +1,5 @@
|
||||
import { Decoder } from "./decoders";
|
||||
import { Encoding } from "./encoding";
|
||||
import { Model } from "./models";
|
||||
import { Normalizer } from "./normalizers";
|
||||
import { PostProcessor } from "./post-processors";
|
||||
@ -187,93 +188,3 @@ export class Tokenizer {
|
||||
*/
|
||||
setDecoder(decoder: Decoder): void;
|
||||
}
|
||||
|
||||
/**
 * An Encoding as returned by the Tokenizer.
 *
 * Exposes read accessors for the tokenization result, plus `pad` and
 * `truncate`, which operate on the current Encoding and return nothing.
 */
interface Encoding {
  /**
   * Returns the attention mask
   */
  getAttentionMask(): number[];

  /**
   * Returns the tokenized ids
   */
  getIds(): number[];

  /**
   * Returns the offsets, as a list of `[number, number]` pairs
   */
  getOffsets(): [number, number][];

  /**
   * Returns the overflowing encoding, after truncation
   *
   * @returns The overflowing Encoding, or `undefined` if none is available
   */
  getOverflowing(): Encoding | undefined;

  /**
   * Returns the special tokens mask
   *
   * Declared as an array, like the attention mask: a mask is a sequence
   * of numbers, not a single number.
   */
  getSpecialTokensMask(): number[];

  /**
   * Returns the tokenized string
   */
  getTokens(): string[];

  /**
   * Returns the type ids
   */
  getTypeIds(): number[];

  /**
   * Returns the original string
   *
   * @param [begin] The index from which to start (can be negative).
   * @param [end] The index (excluded) to which to stop (can be negative).
   * Stopping at the end of the string if not provided.
   * @returns The full original string if no parameter is provided,
   * otherwise the original string between `begin` and `end`
   */
  getOriginalString(begin?: number, end?: number): string;

  /**
   * Pad the current Encoding at the given length
   *
   * @param length The length at which to pad
   * @param [options] Padding options
   */
  pad(length: number, options?: PaddingOptions): void;

  /**
   * Truncate the current Encoding at the given max_length
   *
   * @param length The maximum length to be kept
   * @param [stride=0] The length of the previous first sequence
   * to be included in the overflowing sequence
   */
  truncate(length: number, stride?: number): void;
}
|
||||
|
||||
/**
 * Optional settings accepted by `Encoding.pad`.
 * Every field may be omitted; the documented default then applies.
 */
interface PaddingOptions {
  /**
   * The pad token to be used when padding
   * @default "[PAD]"
   */
  padToken?: string;

  /**
   * The index to be used when padding
   * @default 0
   */
  padId?: number;

  /**
   * The type index to be used when padding
   * @default 0
   */
  padTypeId?: number;

  /**
   * The padding direction
   * @default "right"
   */
  direction?: "left" | "right";
}
|
||||
|
@ -1,2 +1,2 @@
|
||||
// export * from './bindings';
|
||||
// export * from "./bindings";
|
||||
export * from "./tokenizers";
|
||||
|
@ -1,6 +1,9 @@
|
||||
import { promisify } from "util";
|
||||
|
||||
import { Encoding, Tokenizer } from "../bindings/tokenizer";
|
||||
import { Encoding } from "../bindings/encoding";
|
||||
import { Tokenizer } from "../bindings/tokenizer";
|
||||
|
||||
export { Encoding };
|
||||
|
||||
export class BaseTokenizer {
|
||||
constructor(protected tokenizer: Tokenizer) {}
|
||||
|
@ -1,3 +1,4 @@
|
||||
export { Encoding } from "./base.tokenizer";
|
||||
export * from "./bert-wordpiece.tokenizer";
|
||||
export * from "./bpe.tokenizer";
|
||||
export * from "./byte-level-bpe.tokenizer";
|
||||
|
Reference in New Issue
Block a user