mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-09 06:08:22 +00:00
Python - Update CHANGELOG and stub files
This commit is contained in:
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
### Added
|
||||
- [#542]: Add Split pre-tokenizer to easily split using a pattern
|
||||
|
||||
### Changed
|
||||
- [#530]: The various attributes on each component can be get/set
|
||||
|
||||
## [0.9.4]
|
||||
|
||||
### Fixed
|
||||
@@ -276,6 +279,7 @@ delimiter (Works like `.split(delimiter)`)
|
||||
|
||||
|
||||
[#542]: https://github.com/huggingface/tokenizers/pull/542
|
||||
[#530]: https://github.com/huggingface/tokenizers/pull/530
|
||||
[#506]: https://github.com/huggingface/tokenizers/pull/506
|
||||
[#500]: https://github.com/huggingface/tokenizers/pull/500
|
||||
[#498]: https://github.com/huggingface/tokenizers/pull/498
|
||||
|
||||
@@ -417,12 +417,37 @@ class Split(PreTokenizer):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuple with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
Reference in New Issue
Block a user