Perf improvement 16% by removing offsets. (#1587)

* [Breaking Change] Perf improvement 16% by removing offsets.

Offsets are always calculated in Python land. By skipping
this calculation entirely, we gain 16% of the runtime.

This is not the full extent of the gain, because offsets are
still calculated in bytes. (A usage sketch of the new fast path
follows the commit metadata below.)

* Required features.

* Remove clippy error.

* Make it non-breaking and still show the perf improvement.

* Even faster without offsets.

* Update doc.

* Fmt.

* Apply suggestions from code review

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* fmt.

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Nicolas Patry
2024-08-08 14:56:13 +02:00 (committed by GitHub)
parent bd27fa56d6
commit bfd9cdeefb
8 changed files with 247 additions and 6 deletions
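
For a sense of what the change means for users of the Python bindings, here is a minimal usage sketch (the corpus and timing harness are placeholders; only `Tokenizer.from_pretrained`, `encode_batch`, `encode_batch_fast`, and the model name come from this diff):

import time

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
documents = ["Some text to encode."] * 1_000  # placeholder corpus

# Existing path: offsets are computed for every token.
start = time.perf_counter_ns()
tokenizer.encode_batch(documents)
print("encode_batch     ", time.perf_counter_ns() - start)

# New fast path: offset tracking is skipped, offsets come back as (0, 0).
start = time.perf_counter_ns()
tokenizer.encode_batch_fast(documents)
print("encode_batch_fast", time.perf_counter_ns() - start)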

View File

@@ -60,7 +60,12 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
         mergeable_ranks=mergeable_ranks,
         special_tokens=special_tokens,
     )
-    enc.encode("warmup")
+    out = enc.encode("This is a test")
+
+    hf_enc = Tokenizer.from_pretrained(model)
+    out2 = hf_enc.encode("This is a test", add_special_tokens=False).ids
+    assert out == out2, "sanity check"
 
     start = time.perf_counter_ns()
     enc.encode_ordinary_batch(documents, num_threads=num_threads)
@@ -69,11 +74,9 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"tiktoken \t{readable_size} / s")
 
-    hf_enc = Tokenizer.from_pretrained(model)
-    hf_enc.encode("warmup")
 
     start = time.perf_counter_ns()
-    hf_enc.encode_batch(documents)
+    hf_enc.encode_batch_fast(documents)
     end = time.perf_counter_ns()
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"huggingface \t{readable_size} / s")
@@ -82,8 +85,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
 def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
     dataset_xnli = load_dataset(dataset, dataset_config)
-    # input_lengths = [(10, False), (10_000, False), (10_000, True)]  # Example input lengths
-    input_lengths = [(10_000, False, True), (10_000, False, False)]
+    input_lengths = [(10, False, True), (10_000, False, True), (10_000, False, False)]
 
     for num_threads in threads:
         for length, fuse, long in input_lengths:

View File

@@ -892,6 +892,42 @@ class Tokenizer:
         """
         pass
 
+    def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Encode the given batch of inputs. This method is faster than `encode_batch`
+        because it doesn't keep track of offsets, they will be all zeros.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                encode_batch_fast([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+        """
+        pass
+
     @property
     def encode_special_tokens(self):
         """

View File

@@ -1055,6 +1055,67 @@ impl PyTokenizer {
         })
     }
 
+    /// Encode the given batch of inputs. This method is faster than `encode_batch`
+    /// because it doesn't keep track of offsets, they will be all zeros.
+    ///
+    /// Example:
+    ///     Here are some examples of the inputs that are accepted::
+    ///
+    ///         encode_batch_fast([
+    ///             "A single sequence",
+    ///             ("A tuple with a sequence", "And its pair"),
+    ///             [ "A", "pre", "tokenized", "sequence" ],
+    ///             ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+    ///         ])
+    ///
+    /// Args:
+    ///     input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+    ///         A list of single sequences or pair sequences to encode. Each sequence
+    ///         can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+    ///         argument:
+    ///
+    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+    ///
+    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+    ///         Whether the input is already pre-tokenized
+    ///
+    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+    ///         Whether to add the special tokens
+    ///
+    /// Returns:
+    ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+    ///
+    #[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
+    #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
+    fn encode_batch_fast(
+        &self,
+        py: Python<'_>,
+        input: Vec<&PyAny>,
+        is_pretokenized: bool,
+        add_special_tokens: bool,
+    ) -> PyResult<Vec<PyEncoding>> {
+        let input: Vec<tk::EncodeInput> = input
+            .into_iter()
+            .map(|o| {
+                let input: tk::EncodeInput = if is_pretokenized {
+                    o.extract::<PreTokenizedEncodeInput>()?.into()
+                } else {
+                    o.extract::<TextEncodeInput>()?.into()
+                };
+                Ok(input)
+            })
+            .collect::<PyResult<Vec<tk::EncodeInput>>>()?;
+        py.allow_threads(|| {
+            ToPyResult(
+                self.tokenizer
+                    .encode_batch_fast(input, add_special_tokens)
+                    .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
+            )
+            .into()
+        })
+    }
+
     /// Decode the given list of ids back to a string
     ///
     /// This is used to decode anything coming back from a Language Model
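
Per the docstring, the pre-tokenized path of this binding would be exercised like so (a sketch; the token list is arbitrary):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
encodings = tokenizer.encode_batch_fast(
    [["A", "pre", "tokenized", "sequence"]],
    is_pretokenized=True,
    add_special_tokens=False,
)
print(encodings[0].ids)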

View File

@@ -36,6 +36,11 @@ harness = false
 name = "unigram_benchmark"
 harness = false
 
+[[bench]]
+name = "llama3"
+required-features = ["http"]
+harness = false
+
 [dependencies]
 lazy_static = "1.4"
 rand = "0.8"
@@ -80,3 +85,8 @@ tracing-subscriber = "0.3.18"
 [profile.release]
 lto = "fat"
+
+[[example]]
+name = "encode_batch"
+required-features = ["http"]
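
Since both new targets declare `required-features = ["http"]` (the feature gating `Tokenizer::from_pretrained`), a plain `cargo bench` or `cargo run` skips them; presumably they are invoked as `cargo bench --bench llama3 --features http` and `cargo run --release --example encode_batch --features http`.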

View File

@@ -0,0 +1,42 @@
+#[macro_use]
+extern crate criterion;
+
+use criterion::{Criterion, Throughput};
+use tokenizers::Tokenizer;
+
+pub fn llama3(c: &mut Criterion) {
+    let data = std::fs::read_to_string("data/big.txt").unwrap();
+    let mut group = c.benchmark_group("llama3-encode");
+    group.throughput(Throughput::Bytes(data.bytes().len() as u64));
+    group.bench_function("llama3-offsets", |b| {
+        let tokenizer =
+            Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
+        let data: Vec<_> = data.lines().collect();
+        let add_special_tokens = false;
+        b.iter(|| {
+            tokenizer
+                .encode_batch_char_offsets(criterion::black_box(data.clone()), add_special_tokens)
+                .unwrap()
+        })
+    });
+    group.bench_function("llama3-nooffsets", |b| {
+        let tokenizer =
+            Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
+        let data: Vec<_> = data.lines().collect();
+        let add_special_tokens = false;
+        b.iter(|| {
+            tokenizer
+                .encode_batch(criterion::black_box(data.clone()), add_special_tokens)
+                .unwrap()
+        })
+    });
+    group.finish();
+}
+
+criterion_group! {
+    name = bert_benches;
+    config = Criterion::default().sample_size(10);
+    targets = llama3
+}
+
+criterion_main!(bert_benches);

View File

@@ -0,0 +1,11 @@
+use tokenizers::Tokenizer;
+
+fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None)?;
+    let data = std::fs::read_to_string("data/big.txt")?;
+    let data: Vec<_> = data.lines().collect();
+    let add_special_tokens = false;
+
+    tokenizer.encode_batch_char_offsets(data, add_special_tokens)?;
+    Ok(())
+}

View File

@@ -759,6 +759,48 @@ where
         }
     }
 
+    /// Encode the given input. This method accepts both single sequences, as well as pair
+    /// sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
+    /// Contrarily to `encode`, it does not compute offsets
+    /// ```
+    /// # use tokenizers::Tokenizer;
+    /// # use tokenizers::models::bpe::BPE;
+    /// # let mut tokenizer = Tokenizer::new(BPE::default());
+    /// #
+    /// // Sequences:
+    /// tokenizer.encode_fast("Single sequence", false);
+    /// tokenizer.encode_fast(("Sequence A", "Sequence B"), false);
+    ///
+    /// // Pre-tokenized sequences:
+    /// tokenizer.encode_fast(&["Single", "sequence"][..], false);
+    /// tokenizer.encode_fast((
+    ///     &["Sequence", "A"][..],
+    ///     &["Sequence", "B"][..]
+    /// ), false);
+    ///
+    /// // or even both types together:
+    /// tokenizer.encode_fast(("A complete sequence", &["And", "a", "tokenized"][..]), false);
+    /// ```
+    pub fn encode_fast<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
+    where
+        E: Into<EncodeInput<'s>>,
+    {
+        // Extract sequences from the EncodeInput
+        let (sequence, pair) = match input.into() {
+            EncodeInput::Single(s1) => (s1, None),
+            EncodeInput::Dual(s1, s2) => (s1, Some(s2)),
+        };
+
+        // Encode each sequence
+        let encoding = self.encode_single_sequence(sequence, 0, OffsetType::None)?;
+        let pair_encoding = pair
+            .map(|sequence| self.encode_single_sequence(sequence, 1, OffsetType::None))
+            .transpose()?;
+
+        // And finally post process
+        self.post_process(encoding, pair_encoding, add_special_tokens)
+    }
+
     /// Encode the given input. This method accepts both single sequences, as well as pair
     /// sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
     ///
@@ -1059,6 +1101,28 @@ where
         Ok(encodings)
     }
 
+    /// Encode all the sentences in parallel, using multiple threads
+    pub fn encode_batch_fast<'s, E>(
+        &self,
+        inputs: Vec<E>,
+        add_special_tokens: bool,
+    ) -> Result<Vec<Encoding>>
+    where
+        E: Into<EncodeInput<'s>> + Send,
+    {
+        let mut encodings = inputs
+            .into_maybe_par_iter()
+            .map(|input| self.encode_fast(input, add_special_tokens))
+            .collect::<Result<Vec<Encoding>>>()?;
+
+        if let Some(params) = &self.padding {
+            // We do the padding here to make sure we handle the batch padding
+            pad_encodings(&mut encodings, params)?;
+        }
+
+        Ok(encodings)
+    }
+
     /// Decode all sentences in parallel
     pub fn decode_batch(
         &self,
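
One detail from the hunk above: `encode_batch_fast` still runs the same batch padding step as `encode_batch`, so with padding enabled both produce identically shaped outputs. A sketch of that guarantee through the Python binding (the `enable_padding` arguments are illustrative):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
tokenizer.enable_padding(pad_id=0, pad_token="<pad>")

batch = tokenizer.encode_batch_fast(["short", "a slightly longer sequence"])
assert len(batch[0].ids) == len(batch[1].ids)  # padded to a common length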

View File

@@ -8,6 +8,7 @@ use std::collections::HashMap;
 pub enum OffsetType {
     Byte,
     Char,
+    None,
 }
 
 /// Wrapper for a subpart of a `NormalizedString`.
@@ -146,6 +147,19 @@ impl PreTokenizedString {
         let offset_converter = match offset_type {
             OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
             OffsetType::Byte => None,
+            OffsetType::None => {
+                let tokens = self
+                    .splits
+                    .into_iter()
+                    .flat_map(|split| {
+                        split.tokens.unwrap().into_iter().map(|token| {
+                            // Replace this with the actual fields you need for the Encoding type
+                            (token.id, String::with_capacity(0), (0, 0), None, 0)
+                        })
+                    })
+                    .collect();
+                return Ok(tokens);
+            }
         };
 
         Ok(self
@@ -197,6 +211,7 @@ impl PreTokenizedString {
         let offset_converter = match offset_type {
             OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
             OffsetType::Byte => None,
+            OffsetType::None => None,
         };
 
         let mut offset = 0;