Doc - Backbone for API Reference
@@ -602,15 +602,31 @@ impl PyTokenizer {
         })
     }
 
-    /// Input can be:
-    /// encode("A single sequence")
-    /// encode("A sequence", "And its pair")
-    /// encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
-    /// encode(
-    ///     [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
-    ///     is_pretokenized=True
-    /// )
+    /// Encode the given sequence and pair. This method can process raw text sequences
+    /// as well as already pre-tokenized sequences.
+    ///
+    /// Example:
+    ///     Here are some examples of the inputs that are accepted::
+    ///
+    ///         encode("A single sequence")
+    ///         encode("A sequence", "And its pair")
+    ///         encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
+    ///         encode(
+    ///             [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
+    ///             is_pretokenized=True
+    ///         )
+    ///
+    /// Args:
+    ///     sequence (:obj:`~tokenizers.InputSequence`): The main input sequence
+    ///     pair (:obj:`~tokenizers.InputSequence`): An optional input sequence
+    ///     is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
+    ///     add_special_tokens (:obj:`bool`): Whether to add the special tokens
+    ///
+    /// Returns:
+    ///     :class:`~tokenizers.Encoding`: The encoded result
+    ///
     #[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")]
     #[text_signature = "($self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True, /)"]
     fn encode(
         &self,
         sequence: &PyAny,
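As a rough illustration of the signature documented above (a sketch, not part of the commit: the file name "tokenizer.json" is a placeholder for any serialized tokenizer), the accepted input shapes map onto Python calls like this:

    from tokenizers import Tokenizer

    # "tokenizer.json" is a placeholder path; any serialized tokenizer works.
    tokenizer = Tokenizer.from_file("tokenizer.json")

    # Raw text, with or without a pair
    encoding = tokenizer.encode("A single sequence")
    encoding = tokenizer.encode("A sequence", "And its pair")

    # Already pre-tokenized input, with or without a pair
    encoding = tokenizer.encode(["A", "pre", "tokenized", "sequence"], is_pretokenized=True)
    encoding = tokenizer.encode(
        ["A", "pre", "tokenized", "sequence"], ["And", "its", "pair"],
        is_pretokenized=True,
    )

    # The result is a single Encoding
    print(encoding.tokens, encoding.ids)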
@@ -643,14 +659,29 @@ impl PyTokenizer {
             .into()
     }
 
-    /// Input can be:
-    /// encode_batch([
-    ///     "A single sequence",
-    ///     ("A tuple with a sequence", "And its pair"),
-    ///     [ "A", "pre", "tokenized", "sequence" ],
-    ///     ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
-    /// ])
+    /// Encode the given batch of inputs. This method accepts raw text sequences
+    /// as well as already pre-tokenized sequences.
+    ///
+    /// Example:
+    ///     Here are some examples of the inputs that are accepted::
+    ///
+    ///         encode_batch([
+    ///             "A single sequence",
+    ///             ("A tuple with a sequence", "And its pair"),
+    ///             [ "A", "pre", "tokenized", "sequence" ],
+    ///             ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+    ///         ])
+    ///
+    /// Args:
+    ///     input (:obj:`~tokenizers.EncodeInput`): The batch inputs
+    ///     is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
+    ///     add_special_tokens (:obj:`bool`): Whether to add the special tokens
+    ///
+    /// Returns:
+    ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+    ///
     #[args(is_pretokenized = "false", add_special_tokens = "true")]
     #[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"]
     fn encode_batch(
         &self,
         input: Vec<&PyAny>,
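The batch variant accepts the same shapes, one item per entry. A sketch (again assuming the placeholder "tokenizer.json"; raw and pre-tokenized items are split into separate calls here because the is_pretokenized flag applies to the whole batch):

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_file("tokenizer.json")  # placeholder path

    # Raw text items: plain strings or (sequence, pair) tuples
    encodings = tokenizer.encode_batch([
        "A single sequence",
        ("A tuple with a sequence", "And its pair"),
    ])

    # Pre-tokenized items: word lists, optionally paired
    encodings = tokenizer.encode_batch(
        [
            ["A", "pre", "tokenized", "sequence"],
            (["A", "pre", "tokenized", "sequence"], ["And", "its", "pair"]),
        ],
        is_pretokenized=True,
    )

    for enc in encodings:
        print(enc.tokens)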
docs/source/api/node_reference.inc (new file, 1 line)
@@ -0,0 +1 @@
+The node API has not been documented yet.
docs/source/api/python_reference.inc (new file, 2 lines)
@@ -0,0 +1,2 @@
+.. autoclass:: tokenizers.Tokenizer
+    :members:
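Since `.. autoclass::` pulls everything from the installed package, a quick way to preview what autodoc will render (assuming a `tokenizers` wheel or local build is importable) is to inspect the docstrings that the Rust `///` comments and `#[text_signature]` attributes produce:

    import tokenizers

    # These are the strings sphinx.ext.autodoc reads when building the reference.
    print(tokenizers.Tokenizer.__doc__)
    print(tokenizers.Tokenizer.encode.__doc__)
    print(tokenizers.Tokenizer.encode_batch.__doc__)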
docs/source/api/reference.rst (new file, 14 lines)
@@ -0,0 +1,14 @@
+API Reference
+====================================================================================================
+
+.. only:: python
+
+    .. include:: python_reference.inc
+
+.. only:: rust
+
+    .. include:: rust_reference.inc
+
+.. only:: node
+
+    .. include:: node_reference.inc
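Which of the three `.. only::` branches survives the build is controlled by Sphinx tags. A minimal conf.py-side sketch (the `tags` object is injected by Sphinx while conf.py runs, so this is not a standalone script; the DOCS_LANG environment variable is a hypothetical selector, and passing `-t python` to sphinx-build achieves the same thing):

    # Fragment meant for conf.py; `tags` is provided by Sphinx at build time.
    import os

    docs_lang = os.environ.get("DOCS_LANG", "python")  # hypothetical selector
    if docs_lang in ("python", "rust", "node"):
        tags.add(docs_lang)  # keeps the matching `.. only::` block in reference.rst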
docs/source/api/rust_reference.inc (new file, 1 line)
@@ -0,0 +1 @@
+The Rust API has not been documented yet.
@@ -30,7 +30,7 @@ release = "0.9.0"
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = []
+extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon"]
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
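The new extensions matter because the docstrings added above use Google-style Args:/Returns: sections: sphinx.ext.napoleon rewrites those into reST before sphinx.ext.autodoc renders them. A hypothetical plain-Python function with the same layout, shown only to illustrate the format napoleon expects:

    def encode_stub(sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
        """Encode the given sequence and pair.

        Args:
            sequence (str): The main input sequence
            pair (str, optional): An optional input sequence
            is_pretokenized (bool): Whether the input is already pre-tokenized
            add_special_tokens (bool): Whether to add the special tokens

        Returns:
            list: A placeholder result, present only to illustrate the layout.
        """
        return [sequence, pair, is_pretokenized, add_special_tokens]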
@@ -34,6 +34,10 @@ Components:
     pipeline
     components
 
+.. toctree::
+
+    api/reference
+
 Load an existing tokenizer:
 ----------------------------------------------------------------------------------------------------
 