Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
Refactor metaspace (#1476)
* version = "0.15.3-dev-0” Improve performances of meta space, but also just fix it. (transformers) ➜ transformers git:(refactor-default-llama) ✗ python ../scripts/gemma-dummy.py Token indices sequence length is longer than the specified maximum sequence length for this model (14999 > 2048). Running this sequence through the model will result in indexing errors ['<REPR_END>', '▁inform', '<s>', '.', '▁Hey', '<unk>', '.', '▁', '▁', '▁', '▁', '▁', '▁', '▁.'] ['▁inform', '<s>', '.', '▁Hey', '<unk>', '.', '▁', '▁', '▁', '▁', '▁', '▁', '▁.'] [0.0006330013275146484, 0.0014591217041015625, 0.015890836715698242, 0.18584918975830078, 2.1726326942443848] (transformers) ➜ transformers git:(refactor-default-llama) ✗ python ../scripts/gemma-dummy.py Token indices sequence length is longer than the specified maximum sequence length for this model (10000 > 2048). Running this sequence through the model will result in indexing errors ['<REPR_END>', 'in', 'form', '<s>', '.', '▁Hey', '<unk>', '.', '▁▁▁▁▁▁', '▁.'] ['in', 'form', '<s>', '.', '▁Hey', '<unk>', '.', '▁▁▁▁▁▁', '▁.'] [0.0008409023284912109, 0.0008909702301025391, 0.00882411003112793, 0.10214710235595703, 1.187899112701416] * well what do we have * nit * be BC with non legacy * unrelated change for clippy * fix test * splitting is a must for word_ids * fmt and lint * Fixing everything (hopefully better). * Fixing node. * Including yarn.lock * Lint. * Stubs. * revert to use split * fix merge issues * fix tests * finish fixing tests * ruff --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
bindings/node/index.d.ts (vendored, 12 changed lines)
@@ -11,7 +11,11 @@ export function ctcDecoder(
   cleanup?: boolean | undefined | null,
 ): Decoder
 export function fuseDecoder(): Decoder
-export function metaspaceDecoder(replacement?: string = '▁', addPrefixSpace?: bool = true): Decoder
+export function metaspaceDecoder(
+  replacement?: string = '▁',
+  prependScheme?: prepend_scheme = 'always',
+  split?: split = true,
+): Decoder
 export function replaceDecoder(pattern: string, content: string): Decoder
 export function sequenceDecoder(decoders: Array<Decoder>): Decoder
 export function stripDecoder(content: string, left: number, right: number): Decoder
@@ -89,7 +93,11 @@ export function byteLevelAlphabet(): Array<string>
 export function whitespacePreTokenizer(): PreTokenizer
 export function whitespaceSplitPreTokenizer(): PreTokenizer
 export function bertPreTokenizer(): PreTokenizer
-export function metaspacePreTokenizer(replacement?: string = '▁', addPrefixSpace?: bool = true): PreTokenizer
+export function metaspacePreTokenizer(
+  replacement?: string = '▁',
+  prependScheme?: prepend_scheme = 'always',
+  split?: split = true,
+): PreTokenizer
 export function splitPreTokenizer(pattern: string, behavior: string, invert?: boolean | undefined | null): PreTokenizer
 export function punctuationPreTokenizer(behavior?: string | undefined | null): PreTokenizer
 export function sequencePreTokenizer(preTokenizers: Array<PreTokenizer>): PreTokenizer
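The split flag added to both typings defaults to true. A hedged sketch of the intended difference, based on this PR's description rather than on code shown in this diff (the binding itself only forwards the flag to the Rust Metaspace):

import { metaspacePreTokenizer } from 'tokenizers'

// split: true (the default) splits pre-tokenized pieces on the '▁' replacement character.
const splitting = metaspacePreTokenizer('▁', 'always', true)

// split: false is expected to keep the sequence in one piece, so runs of spaces
// survive as a single '▁▁▁▁▁▁'-style token (compare the outputs in the commit message above).
const nonSplitting = metaspacePreTokenizer('▁', 'always', false)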
@@ -219,6 +219,43 @@ switch (platform) {
       loadError = e
     }
     break
+  case 'riscv64':
+    if (isMusl()) {
+      localFileExisted = existsSync(join(__dirname, 'tokenizers.linux-riscv64-musl.node'))
+      try {
+        if (localFileExisted) {
+          nativeBinding = require('./tokenizers.linux-riscv64-musl.node')
+        } else {
+          nativeBinding = require('tokenizers-linux-riscv64-musl')
+        }
+      } catch (e) {
+        loadError = e
+      }
+    } else {
+      localFileExisted = existsSync(join(__dirname, 'tokenizers.linux-riscv64-gnu.node'))
+      try {
+        if (localFileExisted) {
+          nativeBinding = require('./tokenizers.linux-riscv64-gnu.node')
+        } else {
+          nativeBinding = require('tokenizers-linux-riscv64-gnu')
+        }
+      } catch (e) {
+        loadError = e
+      }
+    }
+    break
+  case 's390x':
+    localFileExisted = existsSync(join(__dirname, 'tokenizers.linux-s390x-gnu.node'))
+    try {
+      if (localFileExisted) {
+        nativeBinding = require('./tokenizers.linux-s390x-gnu.node')
+      } else {
+        nativeBinding = require('tokenizers-linux-s390x-gnu')
+      }
+    } catch (e) {
+      loadError = e
+    }
+    break
   default:
     throw new Error(`Unsupported architecture on Linux: ${arch}`)
 }
@@ -1,6 +1,6 @@
 {
   "name": "tokenizers",
-  "version": "0.14.0-dev0",
+  "version": "0.15.3-dev0",
   "repository": {
     "type": "git",
     "url": "git+https://github.com/huggingface/tokenizers.git"
@@ -90,9 +90,11 @@ pub fn fuse_decoder() -> Decoder {
 #[napi]
 pub fn metaspace_decoder(
     #[napi(ts_arg_type = "string = '▁'")] replacement: Option<String>,
-    #[napi(ts_arg_type = "bool = true")] add_prefix_space: Option<bool>,
+    #[napi(ts_arg_type = "prepend_scheme = 'always'")] prepend_scheme: Option<String>,
+    #[napi(ts_arg_type = "split = true")] split: Option<bool>,
 ) -> Result<Decoder> {
-    let add_prefix_space = add_prefix_space.unwrap_or(true);
+    use tk::pre_tokenizers::metaspace::PrependScheme;
+    let split = split.unwrap_or(true);
     let replacement = replacement.unwrap_or("▁".to_string());
     if replacement.chars().count() != 1 {
         return Err(Error::from_reason(
@@ -100,9 +102,20 @@ pub fn metaspace_decoder(
         ));
     }
     let replacement = replacement.chars().next().unwrap();
+    let prepend_scheme: PrependScheme =
+        match prepend_scheme.unwrap_or(String::from("always")).as_str() {
+            "always" => PrependScheme::Always,
+            "first" => PrependScheme::First,
+            "never" => PrependScheme::Never,
+            _ => {
+                return Err(Error::from_reason(
+                    "prepend_scheme is supposed to be either 'always', 'first' or 'never'",
+                ));
+            }
+        };
     Ok(Decoder {
         decoder: Some(Arc::new(RwLock::new(
-            tk::decoders::metaspace::Metaspace::new(replacement, add_prefix_space).into(),
+            tk::decoders::metaspace::Metaspace::new(replacement, prepend_scheme, split).into(),
         ))),
     })
 }
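Both validation paths in the binding above return an Err, which napi is assumed to surface as a thrown JavaScript error. A small TypeScript sketch of what a caller would see at the JavaScript level:

import { metaspaceDecoder } from 'tokenizers'

try {
  // Rejected: the replacement must be exactly one character.
  metaspaceDecoder('▁▁')
} catch (e) {
  console.error(e)
}

try {
  // Rejected: 'bogus' is not one of 'always', 'first' or 'never'.
  metaspaceDecoder('▁', 'bogus')
} catch (e) {
  console.error(e) // "prepend_scheme is supposed to be either 'always', 'first' or 'never'"
}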
@@ -155,9 +155,11 @@ pub fn bert_pre_tokenizer() -> PreTokenizer {
 #[napi]
 pub fn metaspace_pre_tokenizer(
     #[napi(ts_arg_type = "string = '▁'")] replacement: Option<String>,
-    #[napi(ts_arg_type = "bool = true")] add_prefix_space: Option<bool>,
+    #[napi(ts_arg_type = "prepend_scheme = 'always'")] prepend_scheme: Option<String>,
+    #[napi(ts_arg_type = "split = true")] split: Option<bool>,
 ) -> Result<PreTokenizer> {
-    let add_prefix_space = add_prefix_space.unwrap_or(true);
+    use tk::pre_tokenizers::metaspace::PrependScheme;
+    let split = split.unwrap_or(true);
     let replacement = replacement.unwrap_or("▁".to_string());
     if replacement.chars().count() != 1 {
         return Err(Error::from_reason(
@@ -165,10 +167,21 @@ pub fn metaspace_pre_tokenizer(
         ));
     }
     let replacement = replacement.chars().next().unwrap();
+    let prepend_scheme: PrependScheme =
+        match prepend_scheme.unwrap_or(String::from("always")).as_str() {
+            "always" => PrependScheme::Always,
+            "first" => PrependScheme::First,
+            "never" => PrependScheme::Never,
+            _ => {
+                return Err(Error::from_reason(
+                    "prepend_scheme is supposed to be either 'always', 'first' or 'never'",
+                ));
+            }
+        };
 
     Ok(PreTokenizer {
         pretok: Some(Arc::new(RwLock::new(
-            tk::pre_tokenizers::metaspace::Metaspace::new(replacement, add_prefix_space).into(),
+            tk::pre_tokenizers::metaspace::Metaspace::new(replacement, prepend_scheme, split).into(),
         ))),
     })
 }
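For reference, the three accepted prependScheme strings map to the Rust PrependScheme variants matched above. The meanings noted in the comments are assumptions about the core Metaspace behaviour, which is not part of this diff:

import { metaspacePreTokenizer } from 'tokenizers'

const always = metaspacePreTokenizer('▁', 'always') // assumed: prepend '▁' to every split
const first = metaspacePreTokenizer('▁', 'first')   // assumed: prepend only to the first split
const never = metaspacePreTokenizer('▁', 'never')   // assumed: never prepend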
File diff suppressed because it is too large