mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Fixing doc. (#1499)
* Fixing doc.
* SentencePieceUnigram and Convert.py still used sentencepiece
* stub
---------
Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
This commit is contained in:
@ -152,9 +152,11 @@ class Metaspace(Decoder):
|
|||||||
The replacement character. Must be exactly one character. By default we
|
The replacement character. Must be exactly one character. By default we
|
||||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||||
|
|
||||||
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
|
||||||
Whether to add a space to the first word if there isn't already one. This
|
Whether to add a space to the first word if there isn't already one. This
|
||||||
lets us treat `hello` exactly like `say hello`.
|
lets us treat `hello` exactly like `say hello`.
|
||||||
|
Choices: "always", "never", "first". First means the space is only added on the first
|
||||||
|
token (relevant when special tokens are used or other pre_tokenizers are used).
|
||||||
"""
|
"""
|
||||||
def __init__(self, replacement="▁", prepend_scheme="always", split=True):
|
def __init__(self, replacement="▁", prepend_scheme="always", split=True):
|
||||||
pass
|
pass
|
||||||
|
@ -183,8 +183,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
|
tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
|
||||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
prepend_scheme = "always" if add_prefix_space else "never"
|
||||||
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
|
||||||
|
tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
|
||||||
|
|
||||||
parameters = {
|
parameters = {
|
||||||
"model": "SentencePieceUnigram",
|
"model": "SentencePieceUnigram",
|
||||||
|
@ -270,9 +270,12 @@ class Metaspace(PreTokenizer):
|
|||||||
The replacement character. Must be exactly one character. By default we
|
The replacement character. Must be exactly one character. By default we
|
||||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||||
|
|
||||||
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
|
||||||
Whether to add a space to the first word if there isn't already one. This
|
Whether to add a space to the first word if there isn't already one. This
|
||||||
lets us treat `hello` exactly like `say hello`.
|
lets us treat `hello` exactly like `say hello`.
|
||||||
|
Choices: "always", "never", "first". First means the space is only added on the first
|
||||||
|
token (relevant when special tokens are used or other pre_tokenizers are used).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, replacement="_", prepend_scheme="always", split=True):
|
def __init__(self, replacement="_", prepend_scheme="always", split=True):
|
||||||
pass
|
pass
|
||||||
|
@ -102,9 +102,9 @@ class SpmConverter(Converter):
|
|||||||
tokenizer.normalizer = self.normalizer(self.proto)
|
tokenizer.normalizer = self.normalizer(self.proto)
|
||||||
|
|
||||||
replacement = "▁"
|
replacement = "▁"
|
||||||
add_prefix_space = True
|
prepend_scheme = "always"
|
||||||
tokenizer.pre_tokenizer = Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
tokenizer.pre_tokenizer = Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
|
||||||
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
|
||||||
post_processor = self.post_processor(tokenizer)
|
post_processor = self.post_processor(tokenizer)
|
||||||
if post_processor:
|
if post_processor:
|
||||||
tokenizer.post_processor = post_processor
|
tokenizer.post_processor = post_processor
|
||||||
|
@ -304,9 +304,11 @@ impl PyStrip {
|
|||||||
/// The replacement character. Must be exactly one character. By default we
|
/// The replacement character. Must be exactly one character. By default we
|
||||||
/// use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
/// use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||||
///
|
///
|
||||||
/// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
/// prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
|
||||||
/// Whether to add a space to the first word if there isn't already one. This
|
/// Whether to add a space to the first word if there isn't already one. This
|
||||||
/// lets us treat `hello` exactly like `say hello`.
|
/// lets us treat `hello` exactly like `say hello`.
|
||||||
|
/// Choices: "always", "never", "first". First means the space is only added on the first
|
||||||
|
/// token (relevant when special tokens are used or other pre_tokenizers are used).
|
||||||
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Metaspace")]
|
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Metaspace")]
|
||||||
pub struct PyMetaspaceDec {}
|
pub struct PyMetaspaceDec {}
|
||||||
#[pymethods]
|
#[pymethods]
|
||||||
|
@ -477,9 +477,12 @@ pub(crate) fn from_string(string: String) -> Result<PrependScheme, PyErr> {
|
|||||||
/// The replacement character. Must be exactly one character. By default we
|
/// The replacement character. Must be exactly one character. By default we
|
||||||
/// use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
/// use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||||
///
|
///
|
||||||
/// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
/// prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
|
||||||
/// Whether to add a space to the first word if there isn't already one. This
|
/// Whether to add a space to the first word if there isn't already one. This
|
||||||
/// lets us treat `hello` exactly like `say hello`.
|
/// lets us treat `hello` exactly like `say hello`.
|
||||||
|
/// Choices: "always", "never", "first". First means the space is only added on the first
|
||||||
|
/// token (relevant when special tokens are used or other pre_tokenizers are used).
|
||||||
|
///
|
||||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Metaspace")]
|
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Metaspace")]
|
||||||
pub struct PyMetaspace {}
|
pub struct PyMetaspace {}
|
||||||
#[pymethods]
|
#[pymethods]
|
||||||
|
@ -109,8 +109,6 @@ class TestMetaspace:
|
|||||||
# Modify these
|
# Modify these
|
||||||
pretok.replacement = "%"
|
pretok.replacement = "%"
|
||||||
assert pretok.replacement == "%"
|
assert pretok.replacement == "%"
|
||||||
pretok.add_prefix_space = True
|
|
||||||
assert pretok.add_prefix_space == True
|
|
||||||
pretok.prepend_scheme = "first"
|
pretok.prepend_scheme = "first"
|
||||||
assert pretok.prepend_scheme == "first"
|
assert pretok.prepend_scheme == "first"
|
||||||
pretok.split = True
|
pretok.split = True
|
||||||
|
Reference in New Issue
Block a user