mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-08 13:48:19 +00:00
Add CTC Decoder for Wave2Vec models (#693)
* Rust - add a CTCDecoder as a separate mod * Adding bindings to Node + Python. * Clippy update. * Stub. * Fixing roberta.json URLs. * Moving test files to hf.co. * Update cargo check and clippy to 1.52. * Inner ':' actually is used for domains in sphinx. Making `domain` work correctly was just too much work so I went the easy way and have global roles for the custom rust extension. * Update struct naming and docs * Update changelog Co-authored-by: Thomaub <github.thomaub@gmail.com> Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
# Re-export the Rust-backed decoder classes at module level so callers can
# write e.g. `decoders.CTC` / `CTC` interchangeably. `decoders` here is the
# native extension module these stubs describe.
ByteLevel = decoders.ByteLevel
WordPiece = decoders.WordPiece
Metaspace = decoders.Metaspace
BPEDecoder = decoders.BPEDecoder
CTC = decoders.CTC
@@ -68,6 +68,35 @@ class ByteLevel(Decoder):
|
||||
"""
|
||||
pass
|
||||
|
||||
class CTC(Decoder):
    """
    CTC Decoder

    Args:
        pad_token (:obj:`str`, `optional`, defaults to :obj:`<pad>`):
            The pad token used by CTC to delimit a new token.
        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`|`):
            The word delimiter token. It will be replaced by a <space>
        cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to cleanup some tokenization artifacts.
            Mainly spaces before punctuation, and some abbreviated english forms.
    """

    def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
        # Stub signature only — the real implementation lives in the native
        # (Rust) extension module this file documents.
        pass

    def decode(self, tokens):
        """
        Decode the given list of tokens to a final string

        Args:
            tokens (:obj:`List[str]`):
                The list of tokens to decode

        Returns:
            :obj:`str`: The decoded string
        """
        # Stub only; behavior is provided by the native extension.
        pass
class Metaspace(Decoder):
|
||||
"""
|
||||
Metaspace Decoder
|
||||
|
||||
Reference in New Issue
Block a user