add more methods on tokenizer

This commit is contained in:
Pierric Cistac
2020-01-09 18:06:01 -05:00
parent 66d65595f6
commit 969d994f70

View File

@ -1,4 +1,5 @@
import { Model } from "./models";
import { Decoder } from "./decoders";
/**
* A Tokenizer works as a pipeline, it processes some raw text as input and outputs
@ -18,25 +19,86 @@ export class Tokenizer {
* Instantiate a new Tokenizer using the given Model
*/
constructor(model: Model);
/**
* Add the given tokens to the vocabulary
*
* @param tokens A list of tokens to add to the vocabulary.
* Each token can either be a string, or a `[string, boolean]` tuple holding the token
* and a boolean indicating whether to match on single words only.
* If the boolean is not included, it defaults to `false`
* @returns The number of tokens that were added to the vocabulary
*/
addTokens(tokens: (string | [string, boolean])[]): number;
/**
* Add the given special tokens to the vocabulary, and treat them as special tokens.
* The special tokens will never be processed by the model, and will be removed while decoding.
*
* @param tokens The list of special tokens to add
* @returns The number of tokens that were added to the vocabulary
*/
addSpecialTokens(tokens: string[]): number;
/**
* Encode the given sequence
*
* @param sequence The sequence to encode
* @param pair The optional pair sequence; pass `null` when there is none
* @param __callback Callback called when encoding is complete.
* It is passed `err` and the resulting `encoding`
*/
encode(sequence: string, pair: string | null, __callback: (err: any, encoding: Encoding) => void): void;
/**
* Encode the given sequences or pair of sequences
*
* @param sequences A list of sequences or pair of sequences. The list can contain both at the same time.
* @param __callback Callback called when encoding is complete.
* It is passed `err` and the resulting `encodings`
*/
encodeBatch(sequences: (string | [string, string])[], __callback: (err: any, encodings: Encoding[]) => void): void;
/**
* Decode the given list of ids to a string sequence
*
* @param ids A list of ids to be decoded
* @param [skipSpecialTokens=true] Whether to remove all the special tokens from the output string
* @returns The decoded string
*/
decode(ids: number[], skipSpecialTokens?: boolean): string;
/**
* Decode the list of sequences to a list of string sequences
*
* @param sequences A list of sequences of ids to be decoded
* @param [skipSpecialTokens] Whether to remove all the special tokens from the output strings
* @returns A list of decoded strings
*/
decodeBatch(sequences: number[][], skipSpecialTokens?: boolean): string[];
/**
* Convert the given token id to its corresponding string
*
* @param id The token id to convert
* @returns The corresponding string if it exists, `undefined` otherwise
*/
idToToken(id: number): string | undefined;
/**
* Convert the given token to its corresponding id
*
* @param token The token to convert
* @returns The corresponding id if it exists, `undefined` otherwise
*/
tokenToId(token: string): number | undefined;
/**
* Train the model using the given files
*
* @param trainer Trainer to use
* @param files List of files to use, given as strings
* (presumably file paths; confirm against the native binding)
*/
train(trainer: Trainer, files: string[]): void;
/**
* Returns the size of the vocabulary
*
@ -49,12 +111,30 @@ export class Tokenizer {
*/
runningTasks(): number;
/**
* Returns the model used by this Tokenizer
*/
getModel(): Model;
/**
* Change the model to use with this Tokenizer
*
* @param model New model to use
* @throws Will throw an error if any task is running
*/
setModel(model: Model): void;
/**
* Returns the normalizer used by this Tokenizer
*/
getNormalizer(): Normalizer;
/**
* Change the normalizer to use with this Tokenizer
*
* NOTE(review): `setModel` is documented to throw while a task is running;
* confirm whether this setter behaves the same way.
*
* @param normalizer New normalizer to use
*/
setNormalizer(normalizer: Normalizer): void;
/**
* Returns the pre-tokenizer used by this Tokenizer
*/
getPreTokenizer(): PreTokenizer;
/**
* Change the pre-tokenizer to use with this Tokenizer
*
* NOTE(review): `setModel` is documented to throw while a task is running;
* confirm whether this setter behaves the same way.
*
* @param preTokenizer New pre-tokenizer to use
*/
setPreTokenizer(preTokenizer: PreTokenizer): void;
/**
* Returns the post-processor used by this Tokenizer
*/
getPostProcessor(): PostProcessor;
/**
* Change the post-processor to use with this Tokenizer
*
* NOTE(review): `setModel` is documented to throw while a task is running;
* confirm whether this setter behaves the same way.
*
* @param processor New post-processor to use
*/
setPostProcessor(processor: PostProcessor): void;
/**
* Returns the decoder used by this Tokenizer
*/
getDecoder(): Decoder;
/**
* Change the decoder to use with this Tokenizer
*
* NOTE(review): `setModel` is documented to throw while a task is running;
* confirm whether this setter behaves the same way.
*
* @param decoder New decoder to use
*/
setDecoder(decoder: Decoder): void;
}
/**
@ -99,7 +179,7 @@ declare class Encoding {
/**
* Pad the current Encoding at the given length
*
* @param length The length at which to pad
* @param {PaddingOptions} [options] Padding options
*/
pad(length: number, options?: PaddingOptions): void;
@ -107,7 +187,7 @@ declare class Encoding {
/**
* Truncate the current Encoding at the given max_length
*
* @param {number} length The maximum length to be kept
* @param length The maximum length to be kept
* @param {number} [stride=0] The length of the previous first sequence
* to be included in the overflowing sequence
*/