mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
add more methods on tokenizer
This commit is contained in:
96
bindings/node/lib/bindings/tokenizer.d.ts
vendored
96
bindings/node/lib/bindings/tokenizer.d.ts
vendored
@ -1,4 +1,5 @@
|
|||||||
import { Model } from "./models";
|
import { Model } from "./models";
|
||||||
|
import { Decoder } from "./decoders";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A Tokenizer works as a pipeline, it processes some raw text as input and outputs
|
* A Tokenizer works as a pipeline, it processes some raw text as input and outputs
|
||||||
@ -18,25 +19,86 @@ export class Tokenizer {
|
|||||||
* Instantiate a new Tokenizer using the given Model
|
* Instantiate a new Tokenizer using the given Model
|
||||||
*/
|
*/
|
||||||
constructor(model: Model);
|
constructor(model: Model);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add the given tokens to the vocabulary
|
||||||
|
*
|
||||||
|
* @param tokens A list of tokens to add to the vocabulary.
|
||||||
|
* Each token can either be a string, or a tuple with a string representing the token,
|
||||||
|
* and a boolean option representing whether to match on single words only.
|
||||||
|
* If the boolean is not included, it defaults to `false`
|
||||||
|
* @returns The number of tokens that were added to the vocabulary
|
||||||
|
*/
|
||||||
|
addTokens(tokens: (string | [string, boolean])[]): number;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add the given special tokens to the vocabulary, and treat them as special tokens.
|
||||||
|
* The special tokens will never be processed by the model, and will be removed while decoding.
|
||||||
|
*
|
||||||
|
* @param tokens The list of special tokens to add
|
||||||
|
* @returns The number of tokens that were added to the vocabulary
|
||||||
|
*/
|
||||||
|
addSpecialTokens(tokens: string[]): number;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Encode the given sequence
|
* Encode the given sequence
|
||||||
*
|
*
|
||||||
* @param {string} sequence The sequence to encode
|
* @param sequence The sequence to encode
|
||||||
* @param {(string | null)} pair The optional pair sequence
|
* @param pair The optional pair sequence
|
||||||
* @param {(err: any, encoding: Encoding) => void} __callback Callback called when encoding is complete
|
* @param __callback Callback called when encoding is complete
|
||||||
*/
|
*/
|
||||||
encode(sequence: string, pair: string | null, __callback: (err: any, encoding: Encoding) => void): void;
|
encode(sequence: string, pair: string | null, __callback: (err: any, encoding: Encoding) => void): void;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Encode the given sequences or pair of sequences
|
* Encode the given sequences or pair of sequences
|
||||||
*
|
*
|
||||||
* @param {((string | [string, string])[])} sequences A list of sequences or pair of sequences.
|
* @param sequences A list of sequences or pair of sequences. The list can contain both at the same time.
|
||||||
* The list can contain both at the same time.
|
* @param __callback Callback called when encoding is complete
|
||||||
* @param {(err: any, encodings: Encoding[]) => void} __callback Callback called when encoding is complete
|
|
||||||
*/
|
*/
|
||||||
encodeBatch(sequences: (string | [string, string])[], __callback: (err: any, encodings: Encoding[]) => void): void;
|
encodeBatch(sequences: (string | [string, string])[], __callback: (err: any, encodings: Encoding[]) => void): void;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Decode the given list of ids to a string sequence
|
||||||
|
*
|
||||||
|
* @param ids A list of ids to be decoded
|
||||||
|
* @param {boolean} [skipSpecialTokens=true] Whether to remove all the special tokens from the output string
|
||||||
|
* @returns The decoded string
|
||||||
|
*/
|
||||||
|
decode(ids: number[], skipSpecialTokens?: boolean): string;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Decode the list of sequences to a list of string sequences
|
||||||
|
*
|
||||||
|
* @param sequences A list of sequences of ids to be decoded
|
||||||
|
* @param {boolean} [skipSpecialTokens] Whether to remove all the special tokens from the output strings
|
||||||
|
* @returns A list of decoded strings
|
||||||
|
*/
|
||||||
|
decodeBatch(sequences: number[][], skipSpecialTokens?: boolean): string[];
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert the given token id to its corresponding string
|
||||||
|
*
|
||||||
|
* @param id The token id to convert
|
||||||
|
* @returns The corresponding string if it exists
|
||||||
|
*/
|
||||||
|
idToToken(id: number): string | undefined;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert the given token to its corresponding id
|
||||||
|
*
|
||||||
|
* @param token The token to convert
|
||||||
|
* @returns The corresponding id if it exists
|
||||||
|
*/
|
||||||
|
tokenToId(token: string): number | undefined;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Train the model using the given files
|
||||||
|
*
|
||||||
|
* @param trainer Trainer to use
|
||||||
|
* @param files List of files to use
|
||||||
|
*/
|
||||||
|
train(trainer: Trainer, files: string[]): void;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the size of the vocabulary
|
* Returns the size of the vocabulary
|
||||||
*
|
*
|
||||||
@ -49,12 +111,30 @@ export class Tokenizer {
|
|||||||
*/
|
*/
|
||||||
runningTasks(): number;
|
runningTasks(): number;
|
||||||
|
|
||||||
|
getModel(): Model;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Change the model to use with this Tokenizer
|
* Change the model to use with this Tokenizer
|
||||||
* @param model New model to use
|
* @param model New model to use
|
||||||
* @throws Will throw an error if any task is running
|
* @throws Will throw an error if any task is running
|
||||||
*/
|
*/
|
||||||
setModel(model: Model): void;
|
setModel(model: Model): void;
|
||||||
|
|
||||||
|
getNormalizer(): Normalizer;
|
||||||
|
|
||||||
|
setNormalizer(normalizer: Normalizer): void;
|
||||||
|
|
||||||
|
getPreTokenizer(): PreTokenizer;
|
||||||
|
|
||||||
|
setPreTokenizer(preTokenizer: PreTokenizer): void;
|
||||||
|
|
||||||
|
getPostProcessor(): PostProcessor;
|
||||||
|
|
||||||
|
setPostProcessor(processor: PostProcessor): void;
|
||||||
|
|
||||||
|
getDecoder(): Decoder;
|
||||||
|
|
||||||
|
setDecoder(decoder: Decoder): void;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -99,7 +179,7 @@ declare class Encoding {
|
|||||||
/**
|
/**
|
||||||
* Pad the current Encoding at the given length
|
* Pad the current Encoding at the given length
|
||||||
*
|
*
|
||||||
* @param {number} length The length at which to pad
|
* @param length The length at which to pad
|
||||||
* @param {PaddingOptions} [options] Padding options
|
* @param {PaddingOptions} [options] Padding options
|
||||||
*/
|
*/
|
||||||
pad(length: number, options?: PaddingOptions): void;
|
pad(length: number, options?: PaddingOptions): void;
|
||||||
@ -107,7 +187,7 @@ declare class Encoding {
|
|||||||
/**
|
/**
|
||||||
* Truncate the current Encoding at the given max_length
|
* Truncate the current Encoding at the given max_length
|
||||||
*
|
*
|
||||||
* @param {number} length The maximum length to be kept
|
* @param length The maximum length to be kept
|
||||||
* @param {number} [stride=0] The length of the previous first sequence
|
* @param {number} [stride=0] The length of the previous first sequence
|
||||||
* to be included in the overflowing sequence
|
* to be included in the overflowing sequence
|
||||||
*/
|
*/
|
||||||
|
Reference in New Issue
Block a user