Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Perf improvement 16% by removing offsets. (#1587)
* [Breaking Change] 16% perf improvement by removing offsets. Offsets were always calculated in Python land; by skipping that calculation we win 16% of the runtime. This is not the full extent of the possible gain, because offsets are still calculated in bytes.
* Required features.
* Remove clippy error.
* Make it non-breaking and still show the perf improvement.
* Even faster without offsets.
* Update doc.
* Fmt.
* Apply suggestions from code review.
* Fmt.

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
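Taken together, the change threads a fast path through every layer: a new `OffsetType::None` in the core, `encode_fast` and `encode_batch_fast` on the Rust `Tokenizer`, and an `encode_batch_fast` method in the Python bindings. A minimal sketch of the Rust-side usage, assuming the `http` feature (needed for `from_pretrained`) and borrowing the model id from the examples added below:

use tokenizers::Tokenizer;

fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None)?;
    let documents = vec!["This is a test", "And a second document"];

    // Regular path: offsets are tracked for every token (byte-based in the core).
    let with_offsets = tokenizer.encode_batch(documents.clone(), false)?;
    // Fast path introduced by this commit: no offset bookkeeping at all.
    let no_offsets = tokenizer.encode_batch_fast(documents, false)?;

    // Token ids are identical on both paths; only the offsets differ.
    assert_eq!(with_offsets[0].get_ids(), no_offsets[0].get_ids());
    Ok(())
}

On the Python side the equivalent call is `hf_enc.encode_batch_fast(documents)`, which is exactly what the tiktoken benchmark below switches to.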
@@ -60,7 +60,12 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
         mergeable_ranks=mergeable_ranks,
         special_tokens=special_tokens,
     )
-    enc.encode("warmup")
+    out = enc.encode("This is a test")
+
+    hf_enc = Tokenizer.from_pretrained(model)
+    out2 = hf_enc.encode("This is a test", add_special_tokens=False).ids
+
+    assert out == out2, "sanity check"
 
     start = time.perf_counter_ns()
     enc.encode_ordinary_batch(documents, num_threads=num_threads)
@@ -69,11 +74,9 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"tiktoken \t{readable_size} / s")
 
-    hf_enc = Tokenizer.from_pretrained(model)
-    hf_enc.encode("warmup")
 
     start = time.perf_counter_ns()
-    hf_enc.encode_batch(documents)
+    hf_enc.encode_batch_fast(documents)
     end = time.perf_counter_ns()
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"huggingface \t{readable_size} / s")
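A note on the formula shared by both measurements: `time.perf_counter_ns()` ticks in nanoseconds, so `num_bytes / (end - start) * 1e9` is a throughput in bytes per second (for example, 1 GB processed in 2e9 ns works out to 0.5 GB/s), which `format_byte_size` then renders with a human-readable unit.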
@@ -82,8 +85,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
 def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
     dataset_xnli = load_dataset(dataset, dataset_config)
 
-    # input_lengths = [(10, False), (10_000, False), (10_000, True)] # Example input lengths
-    input_lengths = [(10_000, False, True), (10_000, False, False)]
+    input_lengths = [(10, False, True), (10_000, False, True), (10_000, False, False)]
 
     for num_threads in threads:
         for length, fuse, long in input_lengths:
@@ -892,6 +892,42 @@ class Tokenizer:
         """
         pass
 
+    def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Encode the given batch of inputs. This method is faster than `encode_batch`
+        because it doesn't keep track of offsets, they will be all zeros.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                encode_batch_fast([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+        """
+        pass
+
     @property
     def encode_special_tokens(self):
         """
@@ -1055,6 +1055,67 @@ impl PyTokenizer {
         })
     }
 
+    /// Encode the given batch of inputs. This method is faster than `encode_batch`
+    /// because it doesn't keep track of offsets, they will be all zeros.
+    ///
+    /// Example:
+    ///     Here are some examples of the inputs that are accepted::
+    ///
+    ///         encode_batch_fast([
+    ///             "A single sequence",
+    ///             ("A tuple with a sequence", "And its pair"),
+    ///             [ "A", "pre", "tokenized", "sequence" ],
+    ///             ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+    ///         ])
+    ///
+    /// Args:
+    ///     input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+    ///         A list of single sequences or pair sequences to encode. Each sequence
+    ///         can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+    ///         argument:
+    ///
+    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+    ///
+    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+    ///         Whether the input is already pre-tokenized
+    ///
+    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+    ///         Whether to add the special tokens
+    ///
+    /// Returns:
+    ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+    ///
+    #[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
+    #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
+    fn encode_batch_fast(
+        &self,
+        py: Python<'_>,
+        input: Vec<&PyAny>,
+        is_pretokenized: bool,
+        add_special_tokens: bool,
+    ) -> PyResult<Vec<PyEncoding>> {
+        let input: Vec<tk::EncodeInput> = input
+            .into_iter()
+            .map(|o| {
+                let input: tk::EncodeInput = if is_pretokenized {
+                    o.extract::<PreTokenizedEncodeInput>()?.into()
+                } else {
+                    o.extract::<TextEncodeInput>()?.into()
+                };
+                Ok(input)
+            })
+            .collect::<PyResult<Vec<tk::EncodeInput>>>()?;
+        py.allow_threads(|| {
+            ToPyResult(
+                self.tokenizer
+                    .encode_batch_fast(input, add_special_tokens)
+                    .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
+            )
+            .into()
+        })
+    }
+
     /// Decode the given list of ids back to a string
     ///
     /// This is used to decode anything coming back from a Language Model
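Two details of this binding are worth noting: the `#[pyo3(signature = ...)]` attribute keeps the Python-visible defaults identical to `encode_batch`, and the encoding itself runs inside `py.allow_threads`, which releases the GIL so the Rust-side thread pool can work the batch in parallel while other Python threads keep running.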
@@ -36,6 +36,11 @@ harness = false
 name = "unigram_benchmark"
 harness = false
 
+[[bench]]
+name = "llama3"
+required-features = ["http"]
+harness = false
+
 [dependencies]
 lazy_static = "1.4"
 rand = "0.8"
@@ -80,3 +85,8 @@ tracing-subscriber = "0.3.18"
 
 [profile.release]
 lto = "fat"
+
+[[example]]
+name = "encode_batch"
+required-features = ["http"]
+
tokenizers/benches/llama3.rs (new file, 42 lines)
@@ -0,0 +1,42 @@
+#[macro_use]
+extern crate criterion;
+
+use criterion::{Criterion, Throughput};
+use tokenizers::Tokenizer;
+
+pub fn llama3(c: &mut Criterion) {
+    let data = std::fs::read_to_string("data/big.txt").unwrap();
+    let mut group = c.benchmark_group("llama3-encode");
+    group.throughput(Throughput::Bytes(data.bytes().len() as u64));
+    group.bench_function("llama3-offsets", |b| {
+        let tokenizer =
+            Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
+        let data: Vec<_> = data.lines().collect();
+        let add_special_tokens = false;
+        b.iter(|| {
+            tokenizer
+                .encode_batch_char_offsets(criterion::black_box(data.clone()), add_special_tokens)
+                .unwrap()
+        })
+    });
+    group.bench_function("llama3-nooffsets", |b| {
+        let tokenizer =
+            Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
+        let data: Vec<_> = data.lines().collect();
+        let add_special_tokens = false;
+        b.iter(|| {
+            tokenizer
+                .encode_batch(criterion::black_box(data.clone()), add_special_tokens)
+                .unwrap()
+        })
+    });
+    group.finish();
+}
+
+criterion_group! {
+    name = bert_benches;
+    config = Criterion::default().sample_size(10);
+    targets = llama3
+}
+
+criterion_main!(bert_benches);
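Given the `[[bench]]` entry added to `Cargo.toml` above, this should run with something like `cargo bench --bench llama3 --features http` (it expects `data/big.txt` on disk and downloads the tokenizer from the Hub). The two bench functions differ only in the encode call, `encode_batch_char_offsets` versus `encode_batch`, so the gap between them presumably isolates the byte-to-char offset conversion that the Python binding paid for by default.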
tokenizers/examples/encode_batch.rs (new file, 11 lines)
@@ -0,0 +1,11 @@
+use tokenizers::Tokenizer;
+
+fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None)?;
+
+    let data = std::fs::read_to_string("data/big.txt")?;
+    let data: Vec<_> = data.lines().collect();
+    let add_special_tokens = false;
+    tokenizer.encode_batch_char_offsets(data, add_special_tokens)?;
+    Ok(())
+}
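Assuming the `[[example]]` stanza gated on the `http` feature above, this runs with `cargo run --release --example encode_batch --features http`. Note it calls `encode_batch_char_offsets`, the offset-computing path, not the new fast one.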
@@ -759,6 +759,48 @@ where
         }
     }
 
+    /// Encode the given input. This method accepts both single sequences, as well as pair
+    /// sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
+    /// Contrarily to `encode`, it does not compute offsets
+    /// ```
+    /// # use tokenizers::Tokenizer;
+    /// # use tokenizers::models::bpe::BPE;
+    /// # let mut tokenizer = Tokenizer::new(BPE::default());
+    /// #
+    /// // Sequences:
+    /// tokenizer.encode_fast("Single sequence", false);
+    /// tokenizer.encode_fast(("Sequence A", "Sequence B"), false);
+    ///
+    /// // Pre-tokenized sequences:
+    /// tokenizer.encode_fast(&["Single", "sequence"][..], false);
+    /// tokenizer.encode_fast((
+    ///     &["Sequence", "A"][..],
+    ///     &["Sequence", "B"][..]
+    /// ), false);
+    ///
+    /// // or even both types together:
+    /// tokenizer.encode_fast(("A complete sequence", &["And", "a", "tokenized"][..]), false);
+    /// ```
+    pub fn encode_fast<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
+    where
+        E: Into<EncodeInput<'s>>,
+    {
+        // Extract sequences from the EncodeInput
+        let (sequence, pair) = match input.into() {
+            EncodeInput::Single(s1) => (s1, None),
+            EncodeInput::Dual(s1, s2) => (s1, Some(s2)),
+        };
+
+        // Encode each sequence
+        let encoding = self.encode_single_sequence(sequence, 0, OffsetType::None)?;
+        let pair_encoding = pair
+            .map(|sequence| self.encode_single_sequence(sequence, 1, OffsetType::None))
+            .transpose()?;
+
+        // And finally post process
+        self.post_process(encoding, pair_encoding, add_special_tokens)
+    }
+
     /// Encode the given input. This method accepts both single sequences, as well as pair
     /// sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
     ///
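Because both sequences are encoded with `OffsetType::None`, everything offset-related in the returned `Encoding` is a placeholder. A hedged sketch of what a caller can rely on, assuming `tokenizer` is any fully configured `tokenizers::Tokenizer`:

// `tokenizer` is assumed to be a fully configured tokenizers::Tokenizer.
let encoding = tokenizer.encode_fast("Single sequence", false)?;
// Ids are computed as usual; offsets all come back zeroed.
assert!(encoding.get_offsets().iter().all(|&offsets| offsets == (0, 0)));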
@@ -1059,6 +1101,28 @@ where
         Ok(encodings)
     }
 
+    /// Encode all the sentences in parallel, using multiple threads
+    pub fn encode_batch_fast<'s, E>(
+        &self,
+        inputs: Vec<E>,
+        add_special_tokens: bool,
+    ) -> Result<Vec<Encoding>>
+    where
+        E: Into<EncodeInput<'s>> + Send,
+    {
+        let mut encodings = inputs
+            .into_maybe_par_iter()
+            .map(|input| self.encode_fast(input, add_special_tokens))
+            .collect::<Result<Vec<Encoding>>>()?;
+
+        if let Some(params) = &self.padding {
+            // We do the padding here to make sure we handle the batch padding
+            pad_encodings(&mut encodings, params)?;
+        }
+
+        Ok(encodings)
+    }
+
     /// Decode all sentences in parallel
     pub fn decode_batch(
         &self,
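Note that the fast path still funnels through `pad_encodings` when padding is configured, so batch padding behaves exactly as it does for `encode_batch`; only the per-token offset bookkeeping is skipped. `into_maybe_par_iter` is likewise the same conditional-parallelism helper `encode_batch` uses (it respects `TOKENIZERS_PARALLELISM`).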
@@ -8,6 +8,7 @@ use std::collections::HashMap;
 pub enum OffsetType {
     Byte,
     Char,
+    None,
 }
 
 /// Wrapper for a subpart of a `NormalizedString`.
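Adding the `None` variant means the two `match offset_type` expressions below no longer compile until they handle it, which is exactly where the fast path's behavior gets defined: one site short-circuits the tokenization output, the other simply skips building an offset converter.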
@@ -146,6 +147,19 @@ impl PreTokenizedString {
         let offset_converter = match offset_type {
             OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
             OffsetType::Byte => None,
+            OffsetType::None => {
+                let tokens = self
+                    .splits
+                    .into_iter()
+                    .flat_map(|split| {
+                        split.tokens.unwrap().into_iter().map(|token| {
+                            // Replace this with the actual fields you need for the Encoding type
+                            (token.id, String::with_capacity(0), (0, 0), None, 0)
+                        })
+                    })
+                    .collect();
+                return Ok(tokens);
+            }
         };
 
         Ok(self
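The 5-tuples collected here appear to feed `Encoding`'s `FromIterator` implementation, with fields (id, token value, offsets, word index, type id). Filling them with an empty `String`, `(0, 0)` offsets, and no word index is what makes the branch cheap: only the ids are materialized, while token strings and positions are never built.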
@@ -197,6 +211,7 @@ impl PreTokenizedString {
         let offset_converter = match offset_type {
             OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
             OffsetType::Byte => None,
+            OffsetType::None => None,
         };
 
         let mut offset = 0;