diff --git a/bindings/python/CHANGELOG.md b/bindings/python/CHANGELOG.md
index e0c7fea8..02c8afc6 100644
--- a/bindings/python/CHANGELOG.md
+++ b/bindings/python/CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - [#542]: Add Split pre-tokenizer to easily split using a pattern
 
+### Changed
+- [#530]: The various attributes on each component can now be read and set
+
 ## [0.9.4]
 
 ### Fixed
@@ -276,6 +279,7 @@ delimiter (Works like `.split(delimiter)`)
 
 
 [#542]: https://github.com/huggingface/tokenizers/pull/542
+[#530]: https://github.com/huggingface/tokenizers/pull/530
 [#506]: https://github.com/huggingface/tokenizers/pull/506
 [#500]: https://github.com/huggingface/tokenizers/pull/500
 [#498]: https://github.com/huggingface/tokenizers/pull/498
diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
index 86cb343f..b4764e12 100644
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -417,12 +417,37 @@ class Split(PreTokenizer):
         pass
     def pre_tokenize(self, pretok):
         """
-        Pre tokenize the given PreTokenizedString in-place
+        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
+
+        This method allows you to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
 
     def pre_tokenize_str(self, sequence):
         """
-        Pre tokenize the given sequence
+        Pre-tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass
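
For reference, a minimal sketch of how the `pre_tokenize_str` behavior documented above can be exercised from Python, using the `Split` pre-tokenizer referenced in the changelog entry ([#542]). The specific `pattern` and `behavior` arguments here are illustrative assumptions, not part of this diff:

```python
from tokenizers.pre_tokenizers import Split

# Illustrative only: split on a single space and drop the matched delimiter.
# The plain-string pattern and the "removed" behavior are assumptions for
# this sketch; see the Split pre-tokenizer added in #542 for the options.
pre_tokenizer = Split(pattern=" ", behavior="removed")

# pre_tokenize_str returns the pre-tokenized parts together with their
# offsets, as described in the updated docstring.
print(pre_tokenizer.pre_tokenize_str("Hello there friend"))
# Expected shape: [('Hello', (0, 5)), ('there', (6, 11)), ('friend', (12, 18))]
```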