mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-09 06:08:22 +00:00
Python - Update CHANGELOG and stub files
This commit is contained in:
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
### Added
|
||||
- [#542]: Add Split pre-tokenizer to easily split using a pattern
|
||||
|
||||
### Changed
|
||||
- [#530]: The various attributes on each component can be get/set
|
||||
|
||||
## [0.9.4]
|
||||
|
||||
### Fixed
|
||||
@@ -276,6 +279,7 @@ delimiter (Works like `.split(delimiter)`)
|
||||
|
||||
|
||||
[#542]: https://github.com/huggingface/tokenizers/pull/542
|
||||
[#530]: https://github.com/huggingface/tokenizers/pull/530
|
||||
[#506]: https://github.com/huggingface/tokenizers/pull/506
|
||||
[#500]: https://github.com/huggingface/tokenizers/pull/500
|
||||
[#498]: https://github.com/huggingface/tokenizers/pull/498
|
||||
|
||||
@@ -417,12 +417,37 @@ class Split(PreTokenizer):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuple with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
Reference in New Issue
Block a user