Refactor metaspace (#1476)

* version = "0.15.3-dev-0"

Improve the performance of Metaspace, but also just fix it.
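
In short, the Python-facing Metaspace pre-tokenizer and decoder replace the old `add_prefix_space` flag with a `prepend_scheme` string ("always", "never" or "first") and gain a `split` flag. A minimal sketch of the before/after, assuming the defaults visible in the diff below:

    from tokenizers import pre_tokenizers, decoders

    # Old API (before this PR):
    #   pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True)
    #   decoders.Metaspace(replacement="▁", add_prefix_space=True)

    # New API (after this PR): add_prefix_space becomes prepend_scheme, and
    # split controls whether the pre-tokenizer cuts on the replacement char.
    pre_tok = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
    dec = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)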

(transformers) ➜  transformers git:(refactor-default-llama) ✗ python ../scripts/gemma-dummy.py
Token indices sequence length is longer than the specified maximum sequence length for this model (14999 > 2048). Running this sequence through the model will result in indexing errors
['<REPR_END>', '▁inform', '<s>', '.', '▁Hey', '<unk>', '.', '▁', '▁', '▁', '▁', '▁', '▁', '▁.']
['▁inform', '<s>', '.', '▁Hey', '<unk>', '.', '▁', '▁', '▁', '▁', '▁', '▁', '▁.']
[0.0006330013275146484, 0.0014591217041015625, 0.015890836715698242, 0.18584918975830078, 2.1726326942443848]
(transformers) ➜  transformers git:(refactor-default-llama) ✗ python ../scripts/gemma-dummy.py
Token indices sequence length is longer than the specified maximum sequence length for this model (10000 > 2048). Running this sequence through the model will result in indexing errors
['<REPR_END>', 'in', 'form', '<s>', '.', '▁Hey', '<unk>', '.', '▁▁▁▁▁▁', '▁.']
['in', 'form', '<s>', '.', '▁Hey', '<unk>', '.', '▁▁▁▁▁▁', '▁.']
[0.0008409023284912109, 0.0008909702301025391, 0.00882411003112793, 0.10214710235595703, 1.187899112701416]

* well what do we have

* nit

* be BC with non legacy

* unrelated change for clippy

* fix test

* splitting is a must for word_ids

* fmt and lint

* Fixing everything (hopefully better).

* Fixing node.

* Including yarn.lock

* Lint.

* Stubs.

* revert to use split

* fix merge issues

* fix tests

* finish fixing tests

* ruff

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
Author: Arthur
Date: 2024-03-30 10:27:24 +01:00
Committed by: GitHub
Parent: 6153126b22
Commit: 09069717e9
21 changed files with 1672 additions and 1515 deletions


@@ -9,7 +9,7 @@ check_dirs := examples py_src/tokenizers tests
style:
python stub.py
ruff check $(check_dirs) --fix
ruff format $(check_dirs)t
ruff format $(check_dirs)
# Check the source code is formatted correctly
check-style:


@@ -156,7 +156,7 @@ class Metaspace(Decoder):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""
def __init__(self, replacement="", add_prefix_space=True):
def __init__(self, replacement="", prepend_scheme="always", split=True):
pass
def decode(self, tokens):
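
For reference, the decoder's behaviour with the new arguments, using the values exercised by the updated tests later in this diff:

    from tokenizers import decoders

    dec = decoders.Metaspace()  # prepend_scheme="always" by default
    dec.decode(["▁My", "▁name", "▁is", "▁John"])
    # -> "My name is John"

    dec = decoders.Metaspace(replacement="-", prepend_scheme="never")
    dec.decode(["-My", "-name", "-is", "-John"])
    # -> " My name is John" (the leading space is kept with "never")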


@@ -32,8 +32,9 @@ class SentencePieceBPETokenizer(BaseTokenizer):
tokenizer.add_special_tokens([str(unk_token)])
tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
prepend_scheme = "always" if add_prefix_space else "never"
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
parameters = {
"model": "SentencePieceBPE",


@@ -29,8 +29,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
tokenizer.normalizer = normalizers.Sequence(
[normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")]
)
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
prepend_scheme = "always" if add_prefix_space else "never"
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
parameters = {
"model": "SentencePieceUnigram",


@@ -274,7 +274,7 @@ class Metaspace(PreTokenizer):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""
def __init__(self, replacement="_", add_prefix_space=True):
def __init__(self, replacement="_", prepend_scheme="always", split=True):
pass
def pre_tokenize(self, pretok):
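
The new `split` flag controls whether the pre-tokenizer actually cuts the text on the replacement character or leaves each piece whole (splitting is what keeps `word_ids` meaningful, per the commit messages above). A rough illustration, with exact offsets elided:

    from tokenizers import pre_tokenizers

    pre_tok = pre_tokenizers.Metaspace()            # split=True by default
    pre_tok.pre_tokenize_str("Hey friend!")
    # roughly: [("▁Hey", ...), ("▁friend!", ...)]

    pre_tok = pre_tokenizers.Metaspace(split=False)
    pre_tok.pre_tokenize_str("Hey friend!")
    # roughly: [("▁Hey▁friend!", ...)]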


@@ -1,5 +1,6 @@
use std::sync::{Arc, RwLock};
use crate::pre_tokenizers::from_string;
use crate::utils::PyChar;
use crate::utils::PyPattern;
use pyo3::exceptions;
@@ -12,7 +13,7 @@ use tk::decoders::byte_fallback::ByteFallback;
use tk::decoders::byte_level::ByteLevel;
use tk::decoders::ctc::CTC;
use tk::decoders::fuse::Fuse;
use tk::decoders::metaspace::Metaspace;
use tk::decoders::metaspace::{Metaspace, PrependScheme};
use tk::decoders::sequence::Sequence;
use tk::decoders::strip::Strip;
use tk::decoders::wordpiece::WordPiece;
@@ -322,22 +323,46 @@ impl PyMetaspaceDec {
}
#[getter]
fn get_add_prefix_space(self_: PyRef<Self>) -> bool {
getter!(self_, Metaspace, add_prefix_space)
fn get_split(self_: PyRef<Self>) -> bool {
getter!(self_, Metaspace, get_split())
}
#[setter]
fn set_add_prefix_space(self_: PyRef<Self>, add_prefix_space: bool) {
setter!(self_, Metaspace, add_prefix_space, add_prefix_space);
fn set_split(self_: PyRef<Self>, split: bool) {
setter!(self_, Metaspace, @set_split, split);
}
#[getter]
fn get_prepend_scheme(self_: PyRef<Self>) -> String {
// Assuming Metaspace has a method to get the prepend_scheme as a string
let scheme: PrependScheme = getter!(self_, Metaspace, get_prepend_scheme());
match scheme {
PrependScheme::First => "first",
PrependScheme::Never => "never",
PrependScheme::Always => "always",
}
.to_string()
}
#[setter]
fn set_prepend_scheme(self_: PyRef<Self>, prepend_scheme: String) -> PyResult<()> {
let scheme = from_string(prepend_scheme)?;
setter!(self_, Metaspace, @set_prepend_scheme, scheme);
Ok(())
}
#[new]
#[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true), text_signature = "(self, replacement = \"▁\", add_prefix_space = True)")]
fn new(replacement: PyChar, add_prefix_space: bool) -> (Self, PyDecoder) {
(
#[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme = String::from("always"), split = true), text_signature = "(self, replacement = \"▁\", prepend_scheme = \"always\", split = True)")]
fn new(
replacement: PyChar,
prepend_scheme: String,
split: bool,
) -> PyResult<(Self, PyDecoder)> {
let prepend_scheme = from_string(prepend_scheme)?;
Ok((
PyMetaspaceDec {},
Metaspace::new(replacement.0, add_prefix_space).into(),
)
Metaspace::new(replacement.0, prepend_scheme, split).into(),
))
}
}
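
On the Python side these bindings show up as string-valued properties; anything other than "first", "never" or "always" goes through `from_string` and is rejected (the exact exception type is not shown in this hunk):

    from tokenizers import decoders

    dec = decoders.Metaspace()
    dec.prepend_scheme            # "always"
    dec.prepend_scheme = "first"  # accepted
    dec.split = False             # the new split flag is settable too
    # dec.prepend_scheme = "bogus"  # rejected by from_string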


@@ -452,7 +452,7 @@ impl PySequence {
}
}
fn from_string(string: String) -> Result<PrependScheme, PyErr> {
pub(crate) fn from_string(string: String) -> Result<PrependScheme, PyErr> {
let scheme = match string.as_str() {
"first" => PrependScheme::First,
"never" => PrependScheme::Never,
@@ -495,13 +495,13 @@ impl PyMetaspace {
}
#[getter]
fn get_add_prefix_space(self_: PyRef<Self>) -> bool {
getter!(self_, Metaspace, add_prefix_space)
fn get_split(self_: PyRef<Self>) -> bool {
getter!(self_, Metaspace, get_split())
}
#[setter]
fn set_add_prefix_space(self_: PyRef<Self>, add_prefix_space: bool) {
setter!(self_, Metaspace, add_prefix_space, add_prefix_space);
fn set_split(self_: PyRef<Self>, split: bool) {
setter!(self_, Metaspace, @set_split, split);
}
#[getter]
@@ -524,23 +524,15 @@ impl PyMetaspace {
}
#[new]
#[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, prepend_scheme=None, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
#[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme=String::from("always"), split=true), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\", split=True)")]
fn new(
replacement: PyChar,
add_prefix_space: bool,
prepend_scheme: Option<String>,
_kwargs: Option<&PyDict>,
prepend_scheme: String,
split: bool,
) -> PyResult<(Self, PyPreTokenizer)> {
// Create a new Metaspace instance
let mut new_instance: Metaspace = Metaspace::new(replacement.0, add_prefix_space);
// If a prepend scheme is provided, set it
if let Some(prepend_scheme) = prepend_scheme {
match from_string(prepend_scheme) {
Ok(prepend_scheme_enum) => new_instance.set_prepend_scheme(prepend_scheme_enum),
Err(err) => return Err(err),
}
}
let prepend_scheme = from_string(prepend_scheme)?;
let new_instance: Metaspace = Metaspace::new(replacement.0, prepend_scheme, split);
Ok((PyMetaspace {}, new_instance.into()))
}
}
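
Note that the pre-tokenizer constructor no longer swallows `add_prefix_space` through `**_kwargs`: passing the old keyword is now expected to fail, and callers should migrate to `prepend_scheme`:

    from tokenizers import pre_tokenizers

    # Old call relying on the removed keyword (now expected to raise):
    # pre_tokenizers.Metaspace(add_prefix_space=False)

    # Migrated equivalent:
    pre_tokenizers.Metaspace(prepend_scheme="never")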


@@ -126,7 +126,7 @@ class TestMetaspace:
assert Metaspace(replacement="-") is not None
with pytest.raises(ValueError, match="expected a string of length 1"):
Metaspace(replacement="")
assert Metaspace(add_prefix_space=True) is not None
assert Metaspace(prepend_scheme="always") is not None
assert isinstance(Metaspace(), Decoder)
assert isinstance(Metaspace(), Metaspace)
assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
@@ -134,20 +134,20 @@ class TestMetaspace:
def test_decoding(self):
decoder = Metaspace()
assert decoder.decode(["▁My", "▁name", "▁is", "▁John"]) == "My name is John"
decoder = Metaspace(replacement="-", add_prefix_space=False)
decoder = Metaspace(replacement="-", prepend_scheme="never")
assert decoder.decode(["-My", "-name", "-is", "-John"]) == " My name is John"
def test_can_modify(self):
decoder = Metaspace(replacement="*", add_prefix_space=False)
decoder = Metaspace(replacement="*", prepend_scheme="never")
assert decoder.replacement == "*"
assert decoder.add_prefix_space == False
assert decoder.prepend_scheme == "never"
# Modify these
decoder.replacement = "&"
assert decoder.replacement == "&"
decoder.add_prefix_space = True
assert decoder.add_prefix_space == True
decoder.prepend_scheme = "first"
assert decoder.prepend_scheme == "first"
class TestBPEDecoder:


@@ -94,24 +94,27 @@ class TestMetaspace:
assert Metaspace(replacement="-") is not None
with pytest.raises(ValueError, match="expected a string of length 1"):
Metaspace(replacement="")
assert Metaspace(add_prefix_space=True) is not None
assert Metaspace(prepend_scheme="always") is not None
assert isinstance(Metaspace(), PreTokenizer)
assert isinstance(Metaspace(), Metaspace)
assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
def test_can_modify(self):
pretok = Metaspace(replacement="$", add_prefix_space=False)
pretok = Metaspace(replacement="$", prepend_scheme="never")
assert pretok.replacement == "$"
assert pretok.add_prefix_space == False
assert pretok.prepend_scheme == "never"
assert pretok.split == True
# Modify these
pretok.replacement = "%"
assert pretok.replacement == "%"
pretok.add_prefix_space = True
assert pretok.add_prefix_space == True
pretok.prepend_scheme = "never"
assert pretok.prepend_scheme == "never"
pretok.prepend_scheme = "first"
assert pretok.prepend_scheme == "first"
pretok.split = True
assert pretok.split == True
class TestCharDelimiterSplit:


@@ -487,3 +487,51 @@ class TestTokenizer:
tokenizer.add_tokens(["of_text>"])
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]
def test_splitting(self):
tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-new-metaspace")
tokenizer.pre_tokenizer.split = False
tokenizer.add_tokens([AddedToken("<REPR_END>", rstrip=True, lstrip=True)])
assert tokenizer.encode("<REPR_END>inform<s>. Hey. .", add_special_tokens=False).tokens == [
"<REPR_END>",
"in",
"form",
"<s>",
".",
"▁Hey",
".",
"▁▁▁▁▁▁",
"▁.",
]
assert tokenizer.encode("<REPR_END>inform<s>. Hey. .", add_special_tokens=False).ids == [
32000,
262,
689,
1,
29889,
18637,
29889,
539,
869,
]
assert tokenizer.encode("inform<s>. Hey. .").tokens == [
"<s>",
"▁inform",
"<s>",
".",
"▁Hey",
".",
"▁▁▁▁▁▁",
"▁.",
]
assert tokenizer.encode("inform<s>. Hey. .", add_special_tokens=False).tokens == [
"▁inform",
"<s>",
".",
"▁Hey",
".",
"▁▁▁▁▁▁",
"▁.",
]