mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-08 13:48:19 +00:00
Add CTC Decoder for Wave2Vec models (#693)
* Rust - add a CTCDecoder as a separate mod * Adding bindings to Node + Python. * Clippy update. * Stub. * Fixing roberta.json URLs. * Moving test files to hf.co. * Update cargo check and clippy to 1.52. * Inner ':' actually is used for domains in sphinx. Making `domain` work correctly was just too much work so I went the easy way and have global roles for the custom rust extension. * Update struct naming and docs * Update changelog Co-authored-by: Thomaub <github.thomaub@gmail.com> Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
# Re-export the Rust-backed decoder classes at module level so callers can
# write e.g. `decoders.CTC` / `CTC` interchangeably. `decoders` here is the
# native extension module these stubs describe.
ByteLevel = decoders.ByteLevel
WordPiece = decoders.WordPiece
Metaspace = decoders.Metaspace
BPEDecoder = decoders.BPEDecoder
CTC = decoders.CTC
@@ -68,6 +68,35 @@ class ByteLevel(Decoder):
|
||||
"""
|
||||
pass
|
||||
|
||||
class CTC(Decoder):
    """
    CTC Decoder

    Args:
        pad_token (:obj:`str`, `optional`, defaults to :obj:`<pad>`):
            The pad token used by CTC to delimit a new token.
        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`|`):
            The word delimiter token. It will be replaced by a <space>
        cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to cleanup some tokenization artifacts.
            Mainly spaces before punctuation, and some abbreviated english forms.
    """

    def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
        # Stub signature only — the real implementation lives in the native
        # (Rust) extension module this file documents.
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        # Stub only; behavior is provided by the native extension.
        pass
class Metaspace(Decoder):
|
||||
"""
|
||||
Metaspace Decoder
|
||||
|
||||
Reference in New Issue
Block a user