mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Fix doc about split (#1591)
* update doc * add example * Update bindings/python/src/pre_tokenizers.rs Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> * stub --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
@ -421,8 +421,11 @@ class Split(PreTokenizer):
|
||||
|
||||
Args:
|
||||
pattern (:obj:`str` or :class:`~tokenizers.Regex`):
|
||||
A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`
|
||||
|
||||
A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
|
||||
If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`,
|
||||
otherwise we consider it as a string pattern. For example `pattern="|"`
|
||||
means you want to split on `|` (imagine a csv file for example), while
|
||||
`pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
|
||||
behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
|
||||
The behavior to use when splitting.
|
||||
Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
|
||||
|
@ -334,8 +334,11 @@ impl PyWhitespaceSplit {
|
||||
///
|
||||
/// Args:
|
||||
/// pattern (:obj:`str` or :class:`~tokenizers.Regex`):
|
||||
/// A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`
|
||||
///
|
||||
/// A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
|
||||
/// If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`,
|
||||
/// otherwise we consider it as a string pattern. For example `pattern="|"`
|
||||
/// means you want to split on `|` (imagine a csv file for example), while
|
||||
/// `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
|
||||
/// behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
|
||||
/// The behavior to use when splitting.
|
||||
/// Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
|
||||
|
Reference in New Issue
Block a user