Fixing doc. (#1499)

* Fixing doc.

* SentencePieceUnigram and Convert.py still used sentencepiece

* stub

---------

Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
This commit is contained in:
Nicolas Patry
2024-04-17 09:32:40 +02:00
committed by GitHub
parent 949d9e3e0e
commit 91393ef75e
7 changed files with 20 additions and 11 deletions

View File

@ -152,9 +152,11 @@ class Metaspace(Decoder):
The replacement character. Must be exactly one character. By default we The replacement character. Must be exactly one character. By default we
use the `▁` (U+2581) meta symbol (Same as in SentencePiece). use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
Whether to add a space to the first word if there isn't already one. This Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`. lets us treat `hello` exactly like `say hello`.
Choices: "always", "never", "first". First means the space is only added on the first
token (relevant when special tokens are used or other pre_tokenizer are used).
""" """
def __init__(self, replacement="▁", prepend_scheme="always", split=True): def __init__(self, replacement="▁", prepend_scheme="always", split=True):
pass pass

View File

@ -183,8 +183,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
) )
else: else:
tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")]) tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) prepend_scheme = "always" if add_prefix_space else "never"
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
parameters = { parameters = {
"model": "SentencePieceUnigram", "model": "SentencePieceUnigram",

View File

@ -270,9 +270,12 @@ class Metaspace(PreTokenizer):
The replacement character. Must be exactly one character. By default we The replacement character. Must be exactly one character. By default we
use the `▁` (U+2581) meta symbol (Same as in SentencePiece). use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
Whether to add a space to the first word if there isn't already one. This Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`. lets us treat `hello` exactly like `say hello`.
Choices: "always", "never", "first". First means the space is only added on the first
token (relevant when special tokens are used or other pre_tokenizer are used).
""" """
def __init__(self, replacement="▁", prepend_scheme="always", split=True): def __init__(self, replacement="▁", prepend_scheme="always", split=True):
pass pass

View File

@ -102,9 +102,9 @@ class SpmConverter(Converter):
tokenizer.normalizer = self.normalizer(self.proto) tokenizer.normalizer = self.normalizer(self.proto)
replacement = "▁" replacement = "▁"
add_prefix_space = True prepend_scheme = "always"
tokenizer.pre_tokenizer = Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) tokenizer.pre_tokenizer = Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
post_processor = self.post_processor(tokenizer) post_processor = self.post_processor(tokenizer)
if post_processor: if post_processor:
tokenizer.post_processor = post_processor tokenizer.post_processor = post_processor

View File

@ -304,9 +304,11 @@ impl PyStrip {
/// The replacement character. Must be exactly one character. By default we /// The replacement character. Must be exactly one character. By default we
/// use the `▁` (U+2581) meta symbol (Same as in SentencePiece). /// use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
/// ///
/// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): /// prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
/// Whether to add a space to the first word if there isn't already one. This /// Whether to add a space to the first word if there isn't already one. This
/// lets us treat `hello` exactly like `say hello`. /// lets us treat `hello` exactly like `say hello`.
/// Choices: "always", "never", "first". First means the space is only added on the first
/// token (relevant when special tokens are used or other pre_tokenizer are used).
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Metaspace")] #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Metaspace")]
pub struct PyMetaspaceDec {} pub struct PyMetaspaceDec {}
#[pymethods] #[pymethods]

View File

@ -477,9 +477,12 @@ pub(crate) fn from_string(string: String) -> Result<PrependScheme, PyErr> {
/// The replacement character. Must be exactly one character. By default we /// The replacement character. Must be exactly one character. By default we
/// use the `▁` (U+2581) meta symbol (Same as in SentencePiece). /// use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
/// ///
/// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): /// prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
/// Whether to add a space to the first word if there isn't already one. This /// Whether to add a space to the first word if there isn't already one. This
/// lets us treat `hello` exactly like `say hello`. /// lets us treat `hello` exactly like `say hello`.
/// Choices: "always", "never", "first". First means the space is only added on the first
/// token (relevant when special tokens are used or other pre_tokenizer are used).
///
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Metaspace")] #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Metaspace")]
pub struct PyMetaspace {} pub struct PyMetaspace {}
#[pymethods] #[pymethods]

View File

@ -109,8 +109,6 @@ class TestMetaspace:
# Modify these # Modify these
pretok.replacement = "%" pretok.replacement = "%"
assert pretok.replacement == "%" assert pretok.replacement == "%"
pretok.add_prefix_space = True
assert pretok.add_prefix_space == True
pretok.prepend_scheme = "first" pretok.prepend_scheme = "first"
assert pretok.prepend_scheme == "first" assert pretok.prepend_scheme == "first"
pretok.split = True pretok.split = True