# Generated content DO NOT EDIT
class Decoder:
    """
    Base class for all decoders

    This class is not supposed to be instantiated directly. Instead, any implementation of
    a Decoder will return an instance of this class when instantiated.
    """
    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
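
# Usage sketch: this stub only mirrors the compiled `tokenizers` package, so
# decoders are used through the concrete subclasses below, either standalone or
# attached to a Tokenizer (model name here is illustrative):
#
#   >>> from tokenizers import Tokenizer, decoders
#   >>> tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
#   >>> tokenizer.decoder = decoders.WordPiece(prefix="##")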

class BPEDecoder(Decoder):
    """
    BPEDecoder Decoder

    Args:
        suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
            The suffix that was used to characterize an end-of-word. This suffix will
            be replaced by whitespace during the decoding
    """
    def __init__(self, suffix="</w>"):
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
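
# A minimal sketch of BPEDecoder usage; the shown output assumes the suffix is
# turned into a separating space between words:
#
#   >>> from tokenizers.decoders import BPEDecoder
#   >>> BPEDecoder(suffix="</w>").decode(["hello</w>", "world</w>"])
#   'hello world'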

class ByteFallback(Decoder):
    """
    ByteFallback Decoder
    ByteFallback is a simple trick which converts tokens looking like `<0x61>`
    to pure bytes, and attempts to make them into a string. If the tokens
    cannot be decoded you will get the replacement character � instead for each
    inconvertible byte token
    """
    def __init__(self):
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
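
# A minimal sketch: `<0x61>` and `<0x62>` are the byte tokens for "a" and "b",
# while plain tokens pass through unchanged:
#
#   >>> from tokenizers.decoders import ByteFallback
#   >>> ByteFallback().decode(["<0x61>", "<0x62>", "c"])
#   'abc'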

class ByteLevel(Decoder):
    """
    ByteLevel Decoder

    This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
    :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
    """
    def __init__(self):
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
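
# A minimal sketch: `Ġ` is the byte-level representation of a leading space:
#
#   >>> from tokenizers.decoders import ByteLevel
#   >>> ByteLevel().decode(["Hello", "Ġworld"])
#   'Hello world'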

class CTC(Decoder):
    """
    CTC Decoder

    Args:
        pad_token (:obj:`str`, `optional`, defaults to :obj:`<pad>`):
            The pad token used by CTC to delimit a new token.
        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`|`):
            The word delimiter token. It will be replaced by a space
        cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to cleanup some tokenization artifacts.
            Mainly spaces before punctuation, and some abbreviated English forms.
    """
    def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
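
# A minimal sketch of CTC collapse: consecutive duplicate tokens are merged and
# pad tokens are dropped:
#
#   >>> from tokenizers.decoders import CTC
#   >>> CTC().decode(["<pad>", "h", "h", "e", "l", "l", "<pad>", "l", "o"])
#   'hello'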

class Fuse(Decoder):
    """
    Fuse Decoder
    Fuse simply fuses every token into a single string.
    This is usually the last step of decoding; this decoder exists only in case
    other decoders need to be applied *after* the fusion
    """
    def __init__(self):
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
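
# A minimal sketch:
#
#   >>> from tokenizers.decoders import Fuse
#   >>> Fuse().decode(["Hel", "lo"])
#   'Hello'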

class Metaspace(Decoder):
    """
    Metaspace Decoder

    Args:
        replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
            The replacement character. Must be exactly one character. By default we
            use the `▁` (U+2581) meta symbol (same as in SentencePiece).

        prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
            Whether to add a space to the first word if there isn't already one. This
            lets us treat `hello` exactly like `say hello`. Choices are "always",
            "never", and "first", where "first" only adds the space on the first token.

        split (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to split on the replacement character.
    """
    def __init__(self, replacement="▁", prepend_scheme="always", split=True):
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
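
# A minimal sketch: `▁` marks word boundaries and is decoded back to spaces,
# with the prepended space of the first word removed:
#
#   >>> from tokenizers.decoders import Metaspace
#   >>> Metaspace(replacement="▁").decode(["▁Hey", "▁friend"])
#   'Hey friend'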

class Replace(Decoder):
    """
    Replace Decoder

    This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
    :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
    """
    def __init__(self, pattern, content):
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
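
# A minimal sketch: each occurrence of `pattern` is replaced by `content` in
# every token before the tokens are joined (note the leading space in the
# output: Replace alone does not strip the first word's marker):
#
#   >>> from tokenizers.decoders import Replace
#   >>> Replace("▁", " ").decode(["▁Hello", "▁world"])
#   ' Hello world'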

class Sequence(Decoder):
    """
    Sequence Decoder

    Args:
        decoders (:obj:`List[Decoder]`):
            The decoders that need to be chained
    """
    def __init__(self, decoders):
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
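
# A sketch of a typical chain, similar to what SentencePiece-style tokenizers
# use (the exact composition depends on the model):
#
#   >>> from tokenizers import decoders
#   >>> dec = decoders.Sequence([
#   ...     decoders.Replace("▁", " "),
#   ...     decoders.ByteFallback(),
#   ...     decoders.Fuse(),
#   ...     decoders.Strip(" ", 1, 0),
#   ... ])
#   >>> dec.decode(["▁Hey", "▁friend"])
#   'Hey friend'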

class Strip(Decoder):
    """
    Strip Decoder
    Strips up to `left` occurrences of the `content` character from the beginning
    of each token, and up to `right` occurrences from the end
    """
    def __init__(self, content, left=0, right=0):
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
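
# A minimal sketch: strip one leading space from each token:
#
#   >>> from tokenizers.decoders import Strip
#   >>> Strip(" ", 1, 0).decode([" Hello world"])
#   'Hello world'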

class WordPiece(Decoder):
    """
    WordPiece Decoder

    Args:
        prefix (:obj:`str`, `optional`, defaults to :obj:`##`):
            The prefix to use for subwords that are not a beginning-of-word

        cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
            and some abbreviated English forms.
    """
    def __init__(self, prefix="##", cleanup=True):
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        pass
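
# A minimal sketch: `##` pieces are glued to the previous token, while other
# tokens get a separating space:
#
#   >>> from tokenizers.decoders import WordPiece
#   >>> WordPiece(prefix="##").decode(["hel", "##lo", "world"])
#   'hello world'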