Python - Update CHANGELOG and stub files

Anthony MOI
2020-11-27 17:25:43 -05:00
committed by Anthony MOI
parent a351d1c604
commit 6e364cb685
2 changed files with 31 additions and 2 deletions


@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
+- [#542]: Add Split pre-tokenizer to easily split using a pattern
+### Changed
+- [#530]: The various attributes on each component can be get/set
 ## [0.9.4]
 ### Fixed
@@ -276,6 +279,7 @@ delimiter (Works like `.split(delimiter)`)
+[#542]: https://github.com/huggingface/tokenizers/pull/542
 [#530]: https://github.com/huggingface/tokenizers/pull/530
 [#506]: https://github.com/huggingface/tokenizers/pull/506
 [#500]: https://github.com/huggingface/tokenizers/pull/500
 [#498]: https://github.com/huggingface/tokenizers/pull/498
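Both new entries describe runnable API surface, so a brief sketch may help. This is a minimal sketch assuming the `tokenizers` Python package from this release line; the `behavior="removed"` choice and the `ByteLevel.add_prefix_space` attribute are illustrative picks, not part of this diff:

```python
from tokenizers import pre_tokenizers

# [#542]: the Split pre-tokenizer splits on a pattern;
# behavior="removed" drops the matched delimiter from the output.
splitter = pre_tokenizers.Split(pattern=" ", behavior="removed")
print(splitter.pre_tokenize_str("Hello world"))
# -> [('Hello', (0, 5)), ('world', (6, 11))]

# [#530]: component attributes can now be read and written directly.
byte_level = pre_tokenizers.ByteLevel()
print(byte_level.add_prefix_space)   # get the current value
byte_level.add_prefix_space = False  # set a new one
```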


@@ -417,12 +417,37 @@ class Split(PreTokenizer):
         pass
     def pre_tokenize(self, pretok):
         """
-        Pre tokenize the given PreTokenizedString in-place
+        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
+
+        This method allows you to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
     def pre_tokenize_str(self, sequence):
         """
-        Pre tokenize the given sequence
+        Pre-tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+        """
+        pass
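
The `pre_tokenize` docstring above is about in-place modification, which is easiest to see with a small driver. A minimal sketch, assuming the `tokenizers` Python API; the use of `Split` and of `PreTokenizedString.get_splits()` here is an illustrative assumption rather than part of this commit:

```python
from tokenizers import PreTokenizedString
from tokenizers.pre_tokenizers import Split

# Build the PreTokenizedString ourselves so the pre-tokenization
# (and its alignment with the original text) is tracked in place.
pretok = PreTokenizedString("Hello world")

# pre_tokenize modifies pretok in place; it returns nothing.
Split(pattern=" ", behavior="removed").pre_tokenize(pretok)

# Each split keeps its offsets back into the original string.
for piece, offsets, _tokens in pretok.get_splits():
    print(piece, offsets)
```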
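By contrast, `pre_tokenize_str` returns the `List[Tuple[str, Offsets]]` described in its Returns section, with no alignment tracking. A small sketch under the same assumptions, showing that the offsets index into the input string:

```python
from tokenizers.pre_tokenizers import Split

text = "Hello world"
for piece, (start, end) in Split(pattern=" ", behavior="removed").pre_tokenize_str(text):
    # The offsets recover each piece from the original input.
    assert text[start:end] == piece
    print(piece, (start, end))
```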