Fix doc about split (#1591)

* update doc

* add example

* Update bindings/python/src/pre_tokenizers.rs

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>

* stub

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
Arthur
2024-08-07 12:35:01 +02:00
committed by GitHub
parent 6a5fce9fa0
commit eea8e1ae6f
2 changed files with 10 additions and 4 deletions

View File

@ -421,8 +421,11 @@ class Split(PreTokenizer):
Args:
pattern (:obj:`str` or :class:`~tokenizers.Regex`):
A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`
A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`.
If you want to use a regex pattern, it has to be wrapped around a `tokenizer.Regex`,
otherwise we consider is as a string pattern. For example `pattern="|"`
means you want to split on `|` (imagine a csv file for example), while
`patter=tokenizer.Regex("1|2")` means you split on either '1' or '2'.
behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
The behavior to use when splitting.
Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",

View File

@ -334,8 +334,11 @@ impl PyWhitespaceSplit {
///
/// Args:
/// pattern (:obj:`str` or :class:`~tokenizers.Regex`):
/// A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`
///
/// A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`.
/// If you want to use a regex pattern, it has to be wrapped around a `tokenizer.Regex`,
/// otherwise we consider is as a string pattern. For example `pattern="|"`
/// means you want to split on `|` (imagine a csv file for example), while
/// `patter=tokenizer.Regex("1|2")` means you split on either '1' or '2'.
/// behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
/// The behavior to use when splitting.
/// Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",