Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
Doc - Update API Reference on more Tokenizer methods
@@ -47,8 +47,8 @@ use crate::processors::PyPostProcessor;
 /// It works just like :obj:`lstrip` but on the right.
 ///
 /// normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
-/// Defines whether this token should match the normalized version of the input text.
-/// For example, with the added token ``"yesterday"``, and a normalizer in charge of
+/// Defines whether this token should match against the normalized version of the input
+/// text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
 /// lowercasing the text, the token could be extract from the input ``"I saw a lion
 /// Yesterday"``.
 ///
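To illustrate the `normalized` option documented in this hunk, here is a minimal Python sketch (not part of the commit; the token content is only an example):

    from tokenizers import AddedToken

    # normalized=True (the default with add_tokens) lets the token match the
    # normalized text, so with a lowercasing normalizer "yesterday" can still
    # be extracted from "I saw a lion Yesterday".
    token = AddedToken("yesterday", normalized=True)
    print(token.content, token.normalized)  # yesterday True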
@@ -111,7 +111,7 @@ impl PyAddedToken {
 impl PyAddedToken {
 #[new]
 #[args(kwargs = "**")]
-fn new(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<Self> {
+fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<Self> {
 let mut token = PyAddedToken::from(content.unwrap_or(""), None);

 if let Some(kwargs) = kwargs {
@@ -154,31 +154,31 @@ impl PyAddedToken {
 }
 }

-/// Get the content attribute
+/// Get the content of this :obj:`AddedToken`
 #[getter]
 fn get_content(&self) -> &str {
 &self.content
 }

-/// Get the value of the :obj:`rstrip` attribute
+/// Get the value of the :obj:`rstrip` option
 #[getter]
 fn get_rstrip(&self) -> bool {
 self.get_token().rstrip
 }

-/// Get the value of the :obj:`lstrip` attribute
+/// Get the value of the :obj:`lstrip` option
 #[getter]
 fn get_lstrip(&self) -> bool {
 self.get_token().lstrip
 }

-/// Get the value of the :obj:`single_word` attribute
+/// Get the value of the :obj:`single_word` option
 #[getter]
 fn get_single_word(&self) -> bool {
 self.get_token().single_word
 }

-/// Get the value of the :obj:`normalized` attribute
+/// Get the value of the :obj:`normalized` option
 #[getter]
 fn get_normalized(&self) -> bool {
 self.get_token().normalized
@@ -400,6 +400,13 @@ impl<'s> From<PreTokenizedEncodeInput<'s>> for tk::tokenizer::EncodeInput<'s> {

 type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProcessor, PyDecoder>;

+/// A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
+/// and outputs an :class:`~tokenizers.Encoding`.
+///
+/// Args:
+/// model (:class:`~tokenizers.models.Model`):
+/// The core algorithm that this :obj:`Tokenizer` should be using.
+///
 #[pyclass(dict, module = "tokenizers", name=Tokenizer)]
 #[derive(Clone)]
 pub struct PyTokenizer {
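A minimal sketch of the constructor this new docstring describes (the BPE model is just one possible choice of model, used here for illustration):

    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    # The pipeline is built around a core model; normalizer, pre-tokenizer,
    # post-processor and decoder can then be attached to it.
    tokenizer = Tokenizer(BPE())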
@@ -454,19 +461,48 @@ impl PyTokenizer {
 Ok(args)
 }

+/// Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
+///
+/// Args:
+/// json (:obj:`str`):
+/// A valid JSON string representing a previously serialized
+/// :class:`~tokenizers.Tokenizer`
+///
+/// Returns:
+/// :class:`~tokenizers.Tokenizer`: The new tokenizer
 #[staticmethod]
-fn from_str(s: &str) -> PyResult<Self> {
-let tokenizer: PyResult<_> = ToPyResult(s.parse()).into();
+#[text_signature = "(json)"]
+fn from_str(json: &str) -> PyResult<Self> {
+let tokenizer: PyResult<_> = ToPyResult(json.parse()).into();
 Ok(Self::new(tokenizer?))
 }

+/// Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.
+///
+/// Args:
+/// path (:obj:`str`):
+/// A path to a local JSON file representing a previously serialized
+/// :class:`~tokenizers.Tokenizer`
+///
+/// Returns:
+/// :class:`~tokenizers.Tokenizer`: The new tokenizer
 #[staticmethod]
+#[text_signature = "(path)"]
 fn from_file(path: &str) -> PyResult<Self> {
 let tokenizer: PyResult<_> = ToPyResult(Tokenizer::from_file(path)).into();
 Ok(Self::new(tokenizer?))
 }

+/// Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.
+///
+/// Args:
+/// buffer (:obj:`bytes`):
+/// A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`
+///
+/// Returns:
+/// :class:`~tokenizers.Tokenizer`: The new tokenizer
 #[staticmethod]
+#[text_signature = "(buffer)"]
 fn from_buffer(buffer: &PyBytes) -> PyResult<Self> {
 let tokenizer = serde_json::from_slice(buffer.as_bytes()).map_err(|e| {
 exceptions::PyValueError::new_err(format!(
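A short usage sketch for the three constructors documented above (the file path is illustrative, and `to_str` is the serialization method added further down in this diff):

    from tokenizers import Tokenizer

    # From a previously saved JSON file (path is illustrative).
    tokenizer = Tokenizer.from_file("tokenizer.json")

    # The same serialized form can be reloaded from a string or a bytes buffer.
    reloaded = Tokenizer.from_str(tokenizer.to_str())
    reloaded_again = Tokenizer.from_buffer(tokenizer.to_str().encode("utf-8"))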
@@ -477,12 +513,30 @@ impl PyTokenizer {
 Ok(Self { tokenizer })
 }

+/// Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
+///
+/// Args:
+/// pretty (:obj:`bool`, defaults to :obj:`False`):
+/// Whether the JSON string should be pretty formatted.
+///
+/// Returns:
+/// :obj:`str`: A string representing the serialized Tokenizer
 #[args(pretty = false)]
+#[text_signature = "($self, pretty=False)"]
 fn to_str(&self, pretty: bool) -> PyResult<String> {
 ToPyResult(self.tokenizer.to_string(pretty)).into()
 }

+/// Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
+///
+/// Args:
+/// path (:obj:`str`):
+/// A path to a file in which to save the serialized tokenizer.
+///
+/// pretty (:obj:`bool`, defaults to :obj:`False):
+/// Whether the JSON file should be pretty formatted.
 #[args(pretty = false)]
+#[text_signature = "($self, pretty=False)"]
 fn save(&self, path: &str, pretty: bool) -> PyResult<()> {
 ToPyResult(self.tokenizer.save(path, pretty)).into()
 }
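Continuing the sketch above, the two serialization methods documented in this hunk (assuming `tokenizer` is an existing Tokenizer; the file name is illustrative):

    # Serialize to a JSON string, or write it straight to disk.
    json_str = tokenizer.to_str(pretty=True)
    tokenizer.save("tokenizer.json", pretty=True)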
@@ -494,17 +548,49 @@ impl PyTokenizer {
 .map_or(0, |p| p.added_tokens(is_pair)))
 }

+/// Get the underlying vocabulary
+///
+/// Args:
+/// with_added_tokens (:obj:`bool, defaults to :obj:`True`):
+/// Whether to include the added tokens
+///
+/// Returns:
+/// :obj:`Dict[str, int]`: The vocabulary
 #[args(with_added_tokens = true)]
+#[text_signature = "($self, with_added_tokens=True)"]
 fn get_vocab(&self, with_added_tokens: bool) -> PyResult<HashMap<String, u32>> {
 Ok(self.tokenizer.get_vocab(with_added_tokens))
 }

+/// Get the size of the underlying vocabulary
+///
+/// Args:
+/// with_added_tokens (:obj:`bool, defaults to :obj:`True`):
+/// Whether to include the added tokens
+///
+/// Returns:
+/// :obj:`int`: The size of the vocabulary
 #[args(with_added_tokens = true)]
+#[text_signature = "($self, with_added_tokens=True)"]
 fn get_vocab_size(&self, with_added_tokens: bool) -> PyResult<usize> {
 Ok(self.tokenizer.get_vocab_size(with_added_tokens))
 }

+/// Enable truncation
+///
+/// Args:
+/// max_length (:obj:`int`):
+/// The max length at which to truncate
+///
+/// stride (:obj:`int`, `optional`):
+/// The length of the previous first sequence to be included in the overflowing
+/// sequence
+///
+/// strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
+/// The strategy used to truncation. Can be one of ``longest_first``, ``only_first`` or
+/// ``only_second``.
 #[args(kwargs = "**")]
+#[text_signature = "($self, max_length, stride=0, strategy='longest_first')"]
 fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
 let mut params = TruncationParams::default();
 params.max_length = max_length;
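A sketch of the three methods documented in this hunk (the lengths chosen here are arbitrary):

    # Vocabulary, with or without the added tokens.
    vocab = tokenizer.get_vocab(with_added_tokens=True)      # Dict[str, int]
    size = tokenizer.get_vocab_size(with_added_tokens=False)

    # Truncate to 512 positions, keeping 128 tokens of the first sequence in
    # each overflowing piece.
    tokenizer.enable_truncation(max_length=512, stride=128, strategy="longest_first")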
@@ -538,10 +624,17 @@ impl PyTokenizer {
 Ok(())
 }

+/// Disable truncation
+#[text_signature = "($self)"]
 fn no_truncation(&mut self) {
 self.tokenizer.with_truncation(None);
 }

+/// Get the currently set truncation parameters
+///
+/// Returns:
+/// (:obj:`dict`, `optional`):
+/// A dict with the current truncation parameters if truncation is enabled
 #[getter]
 fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult<Option<&'py PyDict>> {
 self.tokenizer.get_truncation().map_or(Ok(None), |params| {
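Since `get_truncation` is exposed as a `#[getter]`, it reads as a plain attribute on the Python side; a sketch of the behaviour documented here (output values are illustrative):

    print(tokenizer.truncation)  # e.g. {'max_length': 512, 'stride': 128, ...} when enabled
    tokenizer.no_truncation()
    print(tokenizer.truncation)  # None once truncation is disabled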
@@ -555,7 +648,31 @@ impl PyTokenizer {
 })
 }

+/// Enable the padding
+///
+/// Args:
+/// direction (:obj:`str`, `optional`, defaults to :obj:`right`):
+/// The direction in which to pad. Can be either ``right`` or ``left``
+///
+/// pad_to_multiple_of (:obj:`int`, `optional`):
+/// If specified, the padding length should always snap to the next multiple of the
+/// given value. For example if we were going to pad witha length of 250 but
+/// ``pad_to_multiple_of=8`` then we will pad to 256.
+///
+/// pad_id (:obj:`int`, defaults to 0):
+/// The id to be used when padding
+///
+/// pad_type_id (:obj:`int`, defaults to 0):
+/// The type id to be used when padding
+///
+/// pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
+/// The pad token to be used when padding
+///
+/// length (:obj:`int`, `optional`):
+/// If specified, the length at which to pad. If not specified we pad using the size of
+/// the longest sequence in a batch.
 #[args(kwargs = "**")]
+#[text_signature = "($self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"]
 fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> {
 let mut params = PaddingParams::default();

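A sketch of `enable_padding` with the `pad_to_multiple_of` behaviour described above (a batch whose longest sequence is 250 tokens gets padded to 256); the pad id is illustrative:

    # Pad to the longest sequence in each batch, rounded up to a multiple of 8.
    tokenizer.enable_padding(pad_id=3, pad_token="[PAD]", pad_to_multiple_of=8)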
@@ -612,10 +729,17 @@ impl PyTokenizer {
 Ok(())
 }

+/// Disable padding
+#[text_signature = "($self)"]
 fn no_padding(&mut self) {
 self.tokenizer.with_padding(None);
 }

+/// Get the current padding parameters
+///
+/// Returns:
+/// (:obj:`dict`, `optional`):
+/// A dict with the current padding parameters if padding is enabled
 #[getter]
 fn get_padding<'py>(&self, py: Python<'py>) -> PyResult<Option<&'py PyDict>> {
 self.tokenizer.get_padding().map_or(Ok(None), |params| {
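And the matching padding getter and reset, mirroring the truncation sketch above:

    print(tokenizer.padding)  # dict of the current padding parameters, or None
    tokenizer.no_padding()
    print(tokenizer.padding)  # None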
@@ -22,8 +22,6 @@ Main features:
 the part of the original sentence that corresponds to any token.
 - Does all the pre-processing: Truncation, Padding, add the special tokens your model needs.

-Components:
-----------------------------------------------------------------------------------------------------

 .. toctree::
 :maxdepth: 2