Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
add more methods on tokenizer
96  bindings/node/lib/bindings/tokenizer.d.ts (vendored)
@@ -1,4 +1,5 @@
import { Model } from "./models";
import { Decoder } from "./decoders";

/**
 * A Tokenizer works as a pipeline, it processes some raw text as input and outputs
@@ -18,25 +19,86 @@ export class Tokenizer {
   * Instantiate a new Tokenizer using the given Model
   */
  constructor(model: Model);

  /**
   * Add the given tokens to the vocabulary
   *
   * @param tokens A list of tokens to add to the vocabulary.
   * Each token can either be a string, or a tuple with a string representing the token,
   * and a boolean option representing whether to match on single words only.
   * If the boolean is not included, it defaults to False
   * @returns The number of tokens that were added to the vocabulary
   */
  addTokens(tokens: (string | [string, boolean])[]): number;

  /**
   * Add the given special tokens to the vocabulary, and treat them as special tokens.
   * The special tokens will never be processed by the model, and will be removed while decoding.
   *
   * @param tokens The list of special tokens to add
   * @returns {number} The number of tokens that were added to the vocabulary
   */
  addSpecialTokens(tokens: string[]): number;

  /**
   * Encode the given sequence
   *
   * @param {string} sequence The sequence to encode
   * @param {(string | null)} pair The optional pair sequence
   * @param {(err: any, encoding: Encoding) => void} __callback Callback called when encoding is complete
   * @param sequence The sequence to encode
   * @param pair The optional pair sequence
   * @param __callback Callback called when encoding is complete
   */
  encode(sequence: string, pair: string | null, __callback: (err: any, encoding: Encoding) => void): void;

  /**
   * Encode the given sequences or pair of sequences
   *
   * @param {((string | [string, string])[])} sequences A list of sequences or pair of sequences.
   * The list can contain both at the same time.
   * @param {(err: any, encodings: Encoding[]) => void} __callback Callback called when encoding is complete
   * @param sequences A list of sequences or pair of sequences. The list can contain both at the same time.
   * @param __callback Callback called when encoding is complete
   */
  encodeBatch(sequences: (string | [string, string])[], __callback: (err: any, encodings: Encoding[]) => void): void;

  /**
   * Decode the given list of ids to a string sequence
   *
   * @param ids A list of ids to be decoded
   * @param {boolean} [skipSpecialTokens=true] Whether to remove all the special tokens from the output string
   * @returns The decoded string
   */
  decode(ids: number[], skipSpecialTokens?: boolean): string;

  /**
   * Decode the list of sequences to a list of string sequences
   *
   * @param sequences A list of sequence of ids to be decoded
   * @param {boolean} [skipSpecialTokens] Whether to remove all the special tokens from the output strings
   * @returns A list of decoded strings
   */
  decodeBatch(sequences: number[][], skipSpecialTokens?: boolean): string[];

  /**
   * Convert the given token id to its corresponding string
   *
   * @param id The token id to convert
   * @returns The corresponding string if it exists
   */
  idToToken(id: number): string | undefined;

  /**
   * Convert the given token to its corresponding id
   *
   * @param token The token to convert
   * @returns The corresponding id if it exists
   */
  tokenToId(token: string): number | undefined;

  /**
   * Train the model using the given files
   *
   * @param trainer Trainer to use
   * @param files List of files to use
   */
  train(trainer: Trainer, files: string[]): void;

  /**
   * Returns the size of the vocabulary
   *
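For reference, a minimal usage sketch of the vocabulary and encoding methods added in the hunk above. It is written against the declarations only: the Tokenizer is assumed to be already constructed with a concrete Model, the import path is assumed to point at this declaration file, encodeAsync is a hypothetical helper that wraps the callback-based encode in a Promise, and getIds() is assumed to exist on the returned Encoding.

import { Tokenizer } from "./tokenizer";

// Hypothetical helper: wrap the callback-based encode in a Promise.
function encodeAsync(tokenizer: Tokenizer, sequence: string, pair: string | null = null): Promise<any> {
  return new Promise((resolve, reject) =>
    tokenizer.encode(sequence, pair, (err, encoding) => (err ? reject(err) : resolve(encoding)))
  );
}

async function example(tokenizer: Tokenizer): Promise<string> {
  // Plain strings or [token, matchSingleWordOnly] tuples extend the vocabulary.
  tokenizer.addTokens(["my_token", ["other_token", true]]);
  tokenizer.addSpecialTokens(["[CLS]", "[SEP]"]);

  const encoding = await encodeAsync(tokenizer, "Hello there!", "How are you?");

  // getIds() is assumed here; decode drops special tokens when the flag is true.
  return tokenizer.decode(encoding.getIds(), true);
}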
@@ -49,12 +111,30 @@ export class Tokenizer {
   */
  runningTasks(): number;

  getModel(): Model;

  /**
   * Change the model to use with this Tokenizer
   * @param model New model to use
   * @throws Will throw an error if any task is running
   */
  setModel(model: Model): void;

  getNormalizer(): Normalizer;

  setNormalizer(normalizer: Normalizer): void;

  getPreTokenizer(): PreTokenizer;

  setPreTokenizer(preTokenizer: PreTokenizer): void;

  getPostProcessor(): PostProcessor;

  setPostProcessor(processor: PostProcessor): void;

  getDecoder(): Decoder;

  setDecoder(decoder: Decoder): void;
}

/**
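The getter/setter pairs above let individual pipeline components be replaced after construction. A hedged sketch follows; the concrete model and decoder instances are assumed to come from the bindings' models and decoders modules, and the runningTasks() check mirrors the documented setModel restriction.

import { Tokenizer } from "./tokenizer";
import { Model } from "./models";
import { Decoder } from "./decoders";

// Swap the model and decoder of an existing Tokenizer.
// setModel is documented to throw while tasks are running, so check first.
function swapComponents(tokenizer: Tokenizer, model: Model, decoder: Decoder): void {
  if (tokenizer.runningTasks() === 0) {
    tokenizer.setModel(model);
  }
  tokenizer.setDecoder(decoder);
}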
@@ -99,7 +179,7 @@ declare class Encoding {
  /**
   * Pad the current Encoding at the given length
   *
   * @param {number} length The length at which to pad
   * @param length The length at which to pad
   * @param {PaddingOptions} [options] Padding options
   */
  pad(length: number, options?: PaddingOptions): void;
@@ -107,7 +187,7 @@ declare class Encoding {
  /**
   * Truncate the current Encoding at the given max_length
   *
   * @param {number} length The maximum length to be kept
   * @param length The maximum length to be kept
   * @param {number} [stride=0] The length of the previous first sequence
   * to be included in the overflowing sequence
   */
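A hedged sketch of the Encoding helpers documented above, truncating to a maximum length and then padding back up to it. The truncate signature (length plus optional stride) is inferred from its JSDoc, and the encode callback shape follows the Tokenizer declaration earlier in this diff.

import { Tokenizer } from "./tokenizer";

// Normalize an encoding to exactly maxLength entries.
function normalizeLength(tokenizer: Tokenizer, sequence: string, maxLength: number): void {
  tokenizer.encode(sequence, null, (err, encoding) => {
    if (err) throw err;
    encoding.truncate(maxLength); // keep at most maxLength entries (stride defaults to 0)
    encoding.pad(maxLength);      // pad back up to maxLength if the encoding is shorter
  });
}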