tokenizers/bindings/node/lib/bindings/decoders.d.ts
Nicolas Patry 73637a0004 Adding ByteFallback support for tokenizers. (#1183)
* Adding ByteFallback support for `tokenizers`.

Two items added:

- A flag `byte_fallback` for the `BPE` model. This will be in charge
  of using byte tokens like `<0x61>` instead of the unknown token on unknown characters.
- A ByteFallback decoder, which will be in charge of putting everything
  back into a string whenever possible, showing � when the byte decoding
  fails (behavior checked against `LlamaTokenizer` in `transformers`).

* Update rustdoc.

* Clippy + Add BPE(byte_fallback) into bindings.

* Stupid file.

* Test artifacts removed.

* Update stub.

* Fix.

* Bad file.

* CRITICAL FIX: wrapper order because of untagged....

* Remove prints.

* Fixing <16 byte fallback.
2023-03-23 16:04:32 +01:00


/**
 * This class is not supposed to be instantiated directly. Instead, any implementation of
 * a Decoder will return an instance of this class when instantiated.
 */
// eslint-disable-next-line @typescript-eslint/no-empty-interface
interface Decoder {
  decode(tokens: string[]): string;
}
/**
 * Instantiate a new ByteLevel Decoder
 */
export function byteLevelDecoder(): Decoder;
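// A minimal usage sketch (illustrative, not from the library docs). It assumes tokens produced
// with the GPT-2 byte-to-unicode mapping used by the ByteLevel pre-tokenizer, where `Ġ` stands
// for a space:
//   const byteLevel = byteLevelDecoder();
//   byteLevel.decode(["Hello", "Ġworld"]); // => "Hello world"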
/**
 * Instantiate a new WordPiece Decoder
 * @param [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
 * @param [cleanup=true] Whether to clean up some tokenization artifacts.
 * Mainly spaces before punctuation, and some abbreviated English forms.
 */
export function wordPieceDecoder(prefix?: string, cleanup?: boolean): Decoder;
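// A minimal usage sketch (illustrative, not from the library docs). Tokens carrying the prefix
// are glued back onto the previous token; the others are joined with spaces:
//   const wordPiece = wordPieceDecoder("##", true);
//   wordPiece.decode(["Hel", "##lo", "how", "are", "you"]); // => "Hello how are you"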
/**
 * Instantiate a new ByteFallback Decoder
 * ByteFallback is a simple trick which converts tokens looking like `<0x61>`
 * to pure bytes, and attempts to decode them into a string. If the tokens
 * cannot be decoded, you will get � instead for each inconvertible byte token.
 */
export function byteFallbackDecoder(): Decoder;
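// A minimal usage sketch (illustrative, not from the library docs). `<0x61>` and `<0x62>` are the
// UTF-8 bytes for "a" and "b"; an incomplete sequence such as a lone `<0xE2>` cannot be decoded,
// so it becomes the replacement character:
//   const byteFallback = byteFallbackDecoder();
//   byteFallback.decode(["<0x61>", "<0x62>"]); // => "ab"
//   byteFallback.decode(["<0xE2>"]);           // => "�"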
/**
 * Instantiate a new Metaspace Decoder
 *
 * @param [replacement='▁'] The replacement character.
 * Must be exactly one character. By default we use the `▁` (U+2581) meta symbol (same as in SentencePiece).
 * @param [addPrefixSpace=true] Whether to add a space to the first word if there isn't already one.
 * This lets us treat `hello` exactly like `say hello`.
 */
export function metaspaceDecoder(replacement?: string, addPrefixSpace?: boolean): Decoder;
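// A minimal usage sketch (illustrative, not from the library docs). The `▁` meta symbol is turned
// back into a space, and the prefix space added to the first word is dropped again:
//   const metaspace = metaspaceDecoder("▁", true);
//   metaspace.decode(["▁Hey", "▁my", "▁friend"]); // => "Hey my friend"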
/**
 * Instantiate a new BPE Decoder
 * @param [suffix='</w>'] The suffix that was used to characterize an end-of-word.
 * This suffix will be replaced by whitespace during decoding.
 */
export function bpeDecoder(suffix?: string): Decoder;
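// A minimal usage sketch (illustrative, not from the library docs). The end-of-word suffix is
// replaced by a space between words and stripped from the final token:
//   const bpe = bpeDecoder("</w>");
//   bpe.decode(["hello</w>", "world</w>"]); // => "hello world"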
/**
 * Instantiate a new CTC Decoder
 * @param [pad_token='pad'] The pad token used by CTC to delimit a new token.
 * @param [word_delimiter_token='|'] The word delimiter token. It will be replaced by a space.
 * @param [cleanup=true] Whether to clean up some tokenization artifacts.
 * Mainly spaces before punctuation, and some abbreviated English forms.
 */
export function ctcDecoder(
  pad_token?: string,
  word_delimiter_token?: string,
  cleanup?: boolean
): Decoder;
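// A minimal usage sketch (illustrative, not from the library docs). Consecutive duplicate tokens
// are collapsed, pad tokens are removed, and the word delimiter becomes a space:
//   const ctc = ctcDecoder("<pad>", "|", true);
//   ctc.decode(["<pad>", "h", "h", "e", "l", "l", "<pad>", "l", "o", "|", "w", "o", "r", "l", "d"]);
//   // => "hello world"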
/**
 * Instantiate a new Sequence Decoder
 * @param [decoders] The decoders to chain
 */
export function sequenceDecoder(decoders: Decoder[]): Decoder;
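// A minimal usage sketch (illustrative, not from the library docs; the chosen chain is an
// arbitrary example). Each decoder runs in order on the output of the previous one, e.g. byte
// fallback first, then metaspace:
//   const chained = sequenceDecoder([byteFallbackDecoder(), metaspaceDecoder()]);
//   chained.decode(["▁Hey", "<0x21>"]); // => "Hey!" (0x21 is "!")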