mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
* Adding ByteFallback support for `tokenizers`. Two items added: - A flag `byte_fallback` for the `BPE` model. This will be in charge of using `<0x61>` instead of unk on unknown tokens. - A ByteFallback decoder, which will be in charge of putting everything back into string whenever possible. Showing � when the byte decoding fails (behavior checked against LlamaTokenizer in `transformers`). * Update rustdoc. * Clippy + Add BPE(byte_fallback) into bindings. * Stupid file. * Test artifacts removed. * Update stub. * Fix. * Bad file. * CRITICAL FIX: wrapper order because of untagged.... * Remove prints. * Fixing <16 byte fallback.
66 lines
2.4 KiB
TypeScript
/**
|
||
* This class is not supposed to be instantiated directly. Instead, any implementation of
|
||
* a Decoder will return an instance of this class when instantiated.
|
||
*/
|
||
// eslint-disable-next-line @typescript-eslint/no-empty-interface
|
||
interface Decoder {
|
||
decode(tokens: string[]): string;
|
||
}
|
||
|
||
/**
|
||
* Instantiate a new ByteLevel Decoder
|
||
*/
|
||
export function byteLevelDecoder(): Decoder;
|
||
|
||
/**
|
||
* Instantiate a new WordPiece Decoder
|
||
* @param [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
|
||
* @param [cleanup=true] Whether to cleanup some tokenization artifacts.
|
||
* Mainly spaces before punctuation, and some abbreviated english forms.
|
||
*/
|
||
export function wordPieceDecoder(prefix?: string, cleanup?: boolean): Decoder;
|
||
|
||
/**
|
||
* Instantiate a new ByteFallback Decoder
|
||
* ByteFallback is a simple trick which converts tokens looking like `<0x61>`
|
||
* to pure bytes, and attempts to make them into a string. If the tokens
|
||
* cannot be decoded you will get <20> instead for each inconvertable byte token
|
||
*/
|
||
export function byteFallbackDecoder(): Decoder;
|
||
|
||
/**
|
||
* Instantiate a new Metaspace
|
||
*
|
||
* @param [replacement='▁'] The replacement character.
|
||
* Must be exactly one character. By default we use the `▁` (U+2581) meta symbol (same as in SentencePiece).
|
||
* @param [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
|
||
* This lets us treat `hello` exactly like `say hello`.
|
||
*/
|
||
export function metaspaceDecoder(replacement?: string, addPrefixSpace?: boolean): Decoder;
|
||
|
||
/**
|
||
* Instantiate a new BPE Decoder
|
||
* @param [suffix='</w>'] The suffix that was used to characterize an end-of-word.
|
||
* This suffix will be replaced by whitespaces during the decoding
|
||
*/
|
||
export function bpeDecoder(suffix?: string): Decoder;
|
||
|
||
/**
|
||
* Instantiate a new CTC Decoder
|
||
* @param [pad_token='pad'] The pad token used by CTC to delimit a new token.
|
||
* @param [word_delimiter_token='|'] The word delimiter token. It will be replaced by a space
|
||
* @param [cleanup=true] Whether to cleanup some tokenization artifacts.
|
||
* Mainly spaces before punctuation, and some abbreviated english forms.
|
||
*/
|
||
export function ctcDecoder(
|
||
pad_token?: string,
|
||
word_delimiter_token?: string,
|
||
cleanup?: boolean
|
||
): Decoder;
|
||
|
||
/**
|
||
* Instantiate a new Sequence Decoder
|
||
* @param [decoders] The decoders to chain
|
||
*/
|
||
export function sequenceDecoder(decoders: Decoder[]): Decoder;
|