Python - Update CHANGELOG and stub files
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - [#542]: Add Split pre-tokenizer to easily split using a pattern
 
+### Changed
+- [#530]: The various attributes on each component can be get/set
+
 ## [0.9.4]
 
 ### Fixed
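As a side note for these two entries, here is a minimal sketch of what they enable, assuming the `tokenizers` Python API of this release. The `behavior="removed"` value and the `BertNormalizer.lowercase` attribute are illustrative choices, not taken from this diff.

from tokenizers.pre_tokenizers import Split
from tokenizers.normalizers import BertNormalizer

# [#542]: split on a pattern; "removed" drops the matched separators,
# while behaviors like "isolated" or "merged_with_next" keep them.
pre = Split(pattern=";", behavior="removed")
print(pre.pre_tokenize_str("a;b;c"))
# [('a', (0, 1)), ('b', (2, 3)), ('c', (4, 5))]

# [#530]: component attributes are now readable and writable in place,
# instead of requiring the component to be rebuilt.
norm = BertNormalizer()
norm.lowercase = False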
@@ -276,6 +279,7 @@ delimiter (Works like `.split(delimiter)`)
 [#542]: https://github.com/huggingface/tokenizers/pull/542
+[#530]: https://github.com/huggingface/tokenizers/pull/530
 [#506]: https://github.com/huggingface/tokenizers/pull/506
 [#500]: https://github.com/huggingface/tokenizers/pull/500
 [#498]: https://github.com/huggingface/tokenizers/pull/498
@@ -417,12 +417,37 @@ class Split(PreTokenizer):
         pass
     def pre_tokenize(self, pretok):
         """
-        Pre tokenize the given PreTokenizedString in-place
+        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
+
+        This method allows you to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
         """
         pass
     def pre_tokenize_str(self, sequence):
         """
-        Pre tokenize the given sequence
+        Pre-tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
         """
         pass
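The two docstrings above describe the same operation at two levels: `pre_tokenize_str` for quick inspection, `pre_tokenize` for in-place processing with alignment tracking. A minimal sketch of the difference, assuming the public `tokenizers` API; the whitespace pattern is an arbitrary example.

from tokenizers import PreTokenizedString
from tokenizers.pre_tokenizers import Split

pre = Split(pattern=" ", behavior="removed")

# pre_tokenize_str: visualization only; returns the parts and offsets
# but keeps no alignment information.
print(pre.pre_tokenize_str("hello world"))
# [('hello', (0, 5)), ('world', (6, 11))]

# pre_tokenize: modifies the PreTokenizedString in place, so later
# pipeline stages can still map splits back to the original string.
pretok = PreTokenizedString("hello world")
pre.pre_tokenize(pretok)
print(pretok.get_splits())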