Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Perf improvement 16% by removing offsets. (#1587)
* [Breaking Change] Perf improvement 16% by removing offsets. Offsets are always calculated in Python land; by not calculating them, we win 16% of the runtime. This is not the total extent of it, because offsets are still calculated in bytes.
* Required features.
* Remove clippy error.
* Make it non breaking and still show perf improvement.
* Even faster without offsets.
* Update doc.
* Fmt.
* Apply suggestions from code review

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* fmt.

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
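To make the claim concrete, here is a minimal sketch of the comparison this commit enables from Python. It assumes a build of `tokenizers` that includes this change; "gpt2" is just an illustrative checkpoint, and absolute numbers will vary by machine and corpus.

    import time
    from tokenizers import Tokenizer

    # Hypothetical micro-benchmark: the same batch through both paths.
    tok = Tokenizer.from_pretrained("gpt2")  # illustrative model id
    docs = ["The quick brown fox jumps over the lazy dog."] * 10_000

    start = time.perf_counter()
    tok.encode_batch(docs)        # computes offsets for every token
    t_slow = time.perf_counter() - start

    start = time.perf_counter()
    tok.encode_batch_fast(docs)   # skips offsets entirely
    t_fast = time.perf_counter() - start

    print(f"encode_batch:      {t_slow:.3f}s")
    print(f"encode_batch_fast: {t_fast:.3f}s")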
@@ -60,7 +60,12 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
         mergeable_ranks=mergeable_ranks,
         special_tokens=special_tokens,
     )
     enc.encode("warmup")
     out = enc.encode("This is a test")
+    hf_enc = Tokenizer.from_pretrained(model)
+    out2 = hf_enc.encode("This is a test", add_special_tokens=False).ids
+
+    assert out == out2, "sanity check"
+
     start = time.perf_counter_ns()
     enc.encode_ordinary_batch(documents, num_threads=num_threads)
@@ -69,11 +74,9 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"tiktoken \t{readable_size} / s")
 
-    hf_enc = Tokenizer.from_pretrained(model)
     hf_enc.encode("warmup")
-
     start = time.perf_counter_ns()
-    hf_enc.encode_batch(documents)
+    hf_enc.encode_batch_fast(documents)
     end = time.perf_counter_ns()
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"huggingface \t{readable_size} / s")
@@ -82,8 +85,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
 def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
     dataset_xnli = load_dataset(dataset, dataset_config)
 
-    # input_lengths = [(10, False), (10_000, False), (10_000, True)]  # Example input lengths
-    input_lengths = [(10_000, False, True), (10_000, False, False)]
+    input_lengths = [(10, False, True), (10_000, False, True), (10_000, False, False)]
 
     for num_threads in threads:
         for length, fuse, long in input_lengths:
@@ -892,6 +892,42 @@ class Tokenizer:
         """
         pass
 
+    def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Encode the given batch of inputs. This method is faster than `encode_batch`
+        because it doesn't keep track of offsets; they will all be zeros.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                encode_batch_fast([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+        """
+        pass
+
     @property
     def encode_special_tokens(self):
         """
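As a quick usage sketch of the documented contract (the checkpoint name is illustrative): token ids match `encode_batch`, while offsets come back zeroed.

    from tokenizers import Tokenizer

    tok = Tokenizer.from_pretrained("bert-base-uncased")  # any checkpoint works
    fast = tok.encode_batch_fast(["A single sequence"], add_special_tokens=False)
    slow = tok.encode_batch(["A single sequence"], add_special_tokens=False)

    assert fast[0].ids == slow[0].ids  # same token ids on both paths
    print(fast[0].offsets)             # all (0, 0): offsets were skipped
    print(slow[0].offsets)             # real character offsets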
@@ -1055,6 +1055,67 @@ impl PyTokenizer {
         })
     }
 
+    /// Encode the given batch of inputs. This method is faster than `encode_batch`
+    /// because it doesn't keep track of offsets; they will all be zeros.
+    ///
+    /// Example:
+    ///     Here are some examples of the inputs that are accepted::
+    ///
+    ///         encode_batch_fast([
+    ///             "A single sequence",
+    ///             ("A tuple with a sequence", "And its pair"),
+    ///             [ "A", "pre", "tokenized", "sequence" ],
+    ///             ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+    ///         ])
+    ///
+    /// Args:
+    ///     input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+    ///         A list of single sequences or pair sequences to encode. Each sequence
+    ///         can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+    ///         argument:
+    ///
+    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+    ///
+    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+    ///         Whether the input is already pre-tokenized
+    ///
+    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+    ///         Whether to add the special tokens
+    ///
+    /// Returns:
+    ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+    ///
+    #[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
+    #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
+    fn encode_batch_fast(
+        &self,
+        py: Python<'_>,
+        input: Vec<&PyAny>,
+        is_pretokenized: bool,
+        add_special_tokens: bool,
+    ) -> PyResult<Vec<PyEncoding>> {
+        let input: Vec<tk::EncodeInput> = input
+            .into_iter()
+            .map(|o| {
+                let input: tk::EncodeInput = if is_pretokenized {
+                    o.extract::<PreTokenizedEncodeInput>()?.into()
+                } else {
+                    o.extract::<TextEncodeInput>()?.into()
+                };
+                Ok(input)
+            })
+            .collect::<PyResult<Vec<tk::EncodeInput>>>()?;
+        py.allow_threads(|| {
+            ToPyResult(
+                self.tokenizer
+                    .encode_batch_fast(input, add_special_tokens)
+                    .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
+            )
+            .into()
+        })
+    }
+
     /// Decode the given list of ids back to a string
     ///
     /// This is used to decode anything coming back from a Language Model
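Two details worth noting in this binding: the Python objects are converted into `tk::EncodeInput` values up front, while the GIL is still held, and only then does `py.allow_threads` release the GIL so the Rust-side batch encoding (which may fan out across threads) can run without blocking the interpreter. This mirrors the existing `encode_batch` binding; only the delegated method differs.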
@@ -36,6 +36,11 @@ harness = false
 name = "unigram_benchmark"
 harness = false
 
+[[bench]]
+name = "llama3"
+required-features = ["http"]
+harness = false
+
 [dependencies]
 lazy_static = "1.4"
 rand = "0.8"
@@ -80,3 +85,8 @@ tracing-subscriber = "0.3.18"
 
 [profile.release]
 lto = "fat"
+
+[[example]]
+name = "encode_batch"
+required-features = ["http"]
+
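Since both new targets are gated behind the `http` feature (needed for `Tokenizer::from_pretrained`), the benchmark and example below should be runnable with something like `cargo bench --bench llama3 --features http` and `cargo run --example encode_batch --features http`.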
tokenizers/benches/llama3.rs (new file, 42 lines)
@@ -0,0 +1,42 @@
#[macro_use]
extern crate criterion;

use criterion::{Criterion, Throughput};
use tokenizers::Tokenizer;

pub fn llama3(c: &mut Criterion) {
    let data = std::fs::read_to_string("data/big.txt").unwrap();
    let mut group = c.benchmark_group("llama3-encode");
    group.throughput(Throughput::Bytes(data.bytes().len() as u64));
    group.bench_function("llama3-offsets", |b| {
        let tokenizer =
            Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
        let data: Vec<_> = data.lines().collect();
        let add_special_tokens = false;
        b.iter(|| {
            tokenizer
                .encode_batch_char_offsets(criterion::black_box(data.clone()), add_special_tokens)
                .unwrap()
        })
    });
    group.bench_function("llama3-nooffsets", |b| {
        let tokenizer =
            Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
        let data: Vec<_> = data.lines().collect();
        let add_special_tokens = false;
        b.iter(|| {
            tokenizer
                .encode_batch(criterion::black_box(data.clone()), add_special_tokens)
                .unwrap()
        })
    });
    group.finish();
}

criterion_group! {
    name = bert_benches;
    config = Criterion::default().sample_size(10);
    targets = llama3
}

criterion_main!(bert_benches);
tokenizers/examples/encode_batch.rs (new file, 11 lines)
@@ -0,0 +1,11 @@
use tokenizers::Tokenizer;

fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None)?;

    let data = std::fs::read_to_string("data/big.txt")?;
    let data: Vec<_> = data.lines().collect();
    let add_special_tokens = false;
    tokenizer.encode_batch_char_offsets(data, add_special_tokens)?;
    Ok(())
}
@@ -759,6 +759,48 @@ where
         }
     }
 
+    /// Encode the given input. This method accepts both single and pair sequences. A
+    /// sequence can be a string, or already pre-tokenized input directly.
+    /// Unlike `encode`, it does not compute offsets:
+    /// ```
+    /// # use tokenizers::Tokenizer;
+    /// # use tokenizers::models::bpe::BPE;
+    /// # let mut tokenizer = Tokenizer::new(BPE::default());
+    /// #
+    /// // Sequences:
+    /// tokenizer.encode_fast("Single sequence", false);
+    /// tokenizer.encode_fast(("Sequence A", "Sequence B"), false);
+    ///
+    /// // Pre-tokenized sequences:
+    /// tokenizer.encode_fast(&["Single", "sequence"][..], false);
+    /// tokenizer.encode_fast((
+    ///     &["Sequence", "A"][..],
+    ///     &["Sequence", "B"][..]
+    /// ), false);
+    ///
+    /// // or even both types together:
+    /// tokenizer.encode_fast(("A complete sequence", &["And", "a", "tokenized"][..]), false);
+    /// ```
+    pub fn encode_fast<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
+    where
+        E: Into<EncodeInput<'s>>,
+    {
+        // Extract sequences from the EncodeInput
+        let (sequence, pair) = match input.into() {
+            EncodeInput::Single(s1) => (s1, None),
+            EncodeInput::Dual(s1, s2) => (s1, Some(s2)),
+        };
+
+        // Encode each sequence
+        let encoding = self.encode_single_sequence(sequence, 0, OffsetType::None)?;
+        let pair_encoding = pair
+            .map(|sequence| self.encode_single_sequence(sequence, 1, OffsetType::None))
+            .transpose()?;
+
+        // And finally post process
+        self.post_process(encoding, pair_encoding, add_special_tokens)
+    }
+
     /// Encode the given input. This method accepts both single and pair sequences. A
     /// sequence can be a string, or already pre-tokenized input directly:
     ///
@@ -1059,6 +1101,28 @@ where
         Ok(encodings)
     }
 
+    /// Encode all the sentences in parallel, using multiple threads
+    pub fn encode_batch_fast<'s, E>(
+        &self,
+        inputs: Vec<E>,
+        add_special_tokens: bool,
+    ) -> Result<Vec<Encoding>>
+    where
+        E: Into<EncodeInput<'s>> + Send,
+    {
+        let mut encodings = inputs
+            .into_maybe_par_iter()
+            .map(|input| self.encode_fast(input, add_special_tokens))
+            .collect::<Result<Vec<Encoding>>>()?;
+
+        if let Some(params) = &self.padding {
+            // We do the padding here to make sure we handle the batch padding
+            pad_encodings(&mut encodings, params)?;
+        }
+
+        Ok(encodings)
+    }
+
     /// Decode all sentences in parallel
     pub fn decode_batch(
         &self,
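The fast path still honors batch padding, as the `pad_encodings` call above shows. A small sketch of that interaction from the Python side (the checkpoint name is illustrative; it is assumed to define a `[PAD]` token at id 0):

    from tokenizers import Tokenizer

    tok = Tokenizer.from_pretrained("bert-base-uncased")
    tok.enable_padding(pad_id=0, pad_token="[PAD]")

    encs = tok.encode_batch_fast(["short", "a noticeably longer sequence"])
    assert len(encs[0].ids) == len(encs[1].ids)  # padded to the longest in the batch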
@@ -8,6 +8,7 @@ use std::collections::HashMap;
 pub enum OffsetType {
     Byte,
     Char,
+    None,
 }
 
 /// Wrapper for a subpart of a `NormalizedString`.
@@ -146,6 +147,19 @@ impl PreTokenizedString {
         let offset_converter = match offset_type {
             OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
             OffsetType::Byte => None,
+            OffsetType::None => {
+                let tokens = self
+                    .splits
+                    .into_iter()
+                    .flat_map(|split| {
+                        split.tokens.unwrap().into_iter().map(|token| {
+                            // Replace this with the actual fields you need for the Encoding type
+                            (token.id, String::with_capacity(0), (0, 0), None, 0)
+                        })
+                    })
+                    .collect();
+                return Ok(tokens);
+            }
         };
 
         Ok(self
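The `None` arm is the heart of the change: instead of building a `BytesToCharOffsetConverter` and doing per-token offset bookkeeping, it emits each token as just its id plus empty or zero placeholders for the remaining fields, and returns early. This shortcut is presumably where most of the reported 16% comes from.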
@@ -197,6 +211,7 @@ impl PreTokenizedString {
         let offset_converter = match offset_type {
             OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
             OffsetType::Byte => None,
+            OffsetType::None => None,
         };
 
         let mut offset = 0;