Mirror of https://github.com/mii443/tokenizers.git
Synced 2025-08-22 16:25:30 +00:00
Fix clippy + feature test management. (#1580)
* Fix clippy + feature test management.
* That example was local, oops.
* Clippy fix.
* Readme indentation.
* README update.
@@ -128,9 +128,9 @@ fn main() -> Result<()> {
 ## Additional information
 
 - tokenizers is designed to leverage CPU parallelism when possible. The level of parallelism is determined
-by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
-environment variable. As an example setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
-**_Please note this behavior may evolve in the future_**
+  by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
+  environment variable. As an example setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
+  **_Please note this behavior may evolve in the future_**
 
 ## Features
 **progressbar**: The progress bar visualization is enabled by default. It might be disabled if
@@ -116,9 +116,9 @@
 //! # Additional information
 //!
 //! - tokenizers is designed to leverage CPU parallelism when possible. The level of parallelism is determined
-//! by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
-//! environment variable. As an example setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
-//! **_Please note this behavior may evolve in the future_**
+//!   by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
+//!   environment variable. As an example setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
+//!   **_Please note this behavior may evolve in the future_**
 //!
 //! # Features
 //! **progressbar**: The progress bar visualization is enabled by default. It might be disabled if
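The two hunks above document the same `RAYON_RS_NUM_THREADS` knob in the README and the crate docs. A minimal sketch of where it takes effect, assuming a local `tokenizer.json` (the file name and sample strings are illustrative):

```rust
use tokenizers::Tokenizer;

// Run with e.g. `RAYON_RS_NUM_THREADS=4 cargo run` to cap the rayon
// thread pool that tokenizers uses for parallel work.
fn main() -> tokenizers::Result<()> {
    let tokenizer = Tokenizer::from_file("tokenizer.json")?;
    // encode_batch is parallelized across the pool sized by that variable.
    let encodings = tokenizer.encode_batch(vec!["Hello there", "How are you?"], true)?;
    println!("encoded {} sequences", encodings.len());
    Ok(())
}
```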
@@ -11,6 +11,7 @@
 //! sequences. The final result looks like this:
 //! - Single sequence: `[CLS] Hello there [SEP]`
 //! - Pair sequences: `[CLS] My name is Anthony [SEP] What is my name? [SEP]`
+//!
 //! With the type ids as following:
 //! ```markdown
 //! [CLS] ... [SEP] ... [SEP]
@@ -75,8 +76,8 @@ pub enum Sequence {
 /// It can be either the input sequence or a [`SpecialToken`]:
 ///
 /// - The `Sequence` has an associated `type_id` which is used by default
-/// for any token inside this sequence. The `Sequence` corresponds to one
-/// of the input sequence given as input of the `PostProcessor`.
+///   for any token inside this sequence. The `Sequence` corresponds to one
+///   of the input sequence given as input of the `PostProcessor`.
 ///
 /// - The `SpecialToken` has an associated `id`. It corresponds to a [`SpecialToken`].
 ///
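The `[CLS]`/`[SEP]` template and type ids described in these two hunks can be built with the `TemplateProcessing` builder; a short sketch, where the special-token ids 1 and 2 are placeholders for whatever the vocabulary actually assigns:

```rust
use tokenizers::processors::template::TemplateProcessing;

fn bert_like_template() -> TemplateProcessing {
    // `$A`/`$B` stand for the first/second input sequence; the `:1` suffix
    // assigns type_id 1 to the second sequence and its trailing [SEP],
    // matching the pair layout documented above.
    TemplateProcessing::builder()
        .try_single("[CLS] $A [SEP]")
        .unwrap()
        .try_pair("[CLS] $A [SEP] $B:1 [SEP]:1")
        .unwrap()
        .special_tokens(vec![("[CLS]", 1), ("[SEP]", 2)])
        .build()
        .unwrap()
}
```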
@@ -3,11 +3,11 @@
 //! A [`Tokenizer`](struct.Tokenizer.html) is composed of some of the following parts.
 //! - [`Normalizer`](trait.Normalizer.html): Takes care of the text normalization (like unicode normalization).
 //! - [`PreTokenizer`](trait.PreTokenizer.html): Takes care of the pre tokenization (ie. How to split tokens and pre-process
-//! them.
+//!   them.
 //! - [`Model`](trait.Model.html): A model encapsulates the tokenization algorithm (like BPE, Word base, character
-//! based, ...).
+//!   based, ...).
 //! - [`PostProcessor`](trait.PostProcessor.html): Takes care of the processing after tokenization (like truncating, padding,
-//! ...).
+//!   ...).
 
 use std::{
     collections::HashMap,
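As a rough sketch of how the parts listed in this hunk compose (the constructor and setter names are assumed from the public API; some versions take the setter arguments directly rather than wrapped in `Some`):

```rust
use tokenizers::models::bpe::BPE;
use tokenizers::normalizers::unicode::NFC;
use tokenizers::pre_tokenizers::whitespace::Whitespace;
use tokenizers::Tokenizer;

fn build_pipeline() -> Tokenizer {
    // Model: the core tokenization algorithm (an empty BPE for illustration).
    let mut tokenizer = Tokenizer::new(BPE::default());
    // Normalizer: unicode NFC normalization runs before everything else.
    tokenizer.with_normalizer(Some(NFC));
    // PreTokenizer: split on whitespace before the model sees the text.
    tokenizer.with_pre_tokenizer(Some(Whitespace {}));
    tokenizer
}
```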
@@ -1297,17 +1297,13 @@ where
 
 #[cfg(test)]
 mod test {
-
-    use crate::AddedToken;
-    use crate::Tokenizer;
-
     #[cfg(feature = "http")]
     #[test]
    fn test_decoding_with_added_bpe() {
         use crate::{
             normalizers,
             pre_tokenizers::split::{Split, SplitPattern},
-            NormalizerWrapper, PreTokenizerWrapper, SplitDelimiterBehavior,
+            AddedToken, NormalizerWrapper, PreTokenizerWrapper, SplitDelimiterBehavior, Tokenizer,
         };
 
         let mut tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3-8B", None).unwrap();
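A note on the `http` gate visible above: `Tokenizer::from_pretrained` is only compiled when the crate's `http` feature is enabled, which is why this test runs only under `cargo test --features http`. A hypothetical downstream consumer (the model id and version number are placeholders):

```rust
// Cargo.toml (version is an assumption):
// tokenizers = { version = "0.20", features = ["http"] }
use tokenizers::{Result, Tokenizer};

fn load_remote() -> Result<Tokenizer> {
    // Fetches the tokenizer.json published under this model id on the Hub.
    Tokenizer::from_pretrained("bert-base-cased", None)
}
```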
@@ -305,6 +305,7 @@ impl NormalizedString {
     /// - `1` if this is a new char
     /// - `-N` if the char is right before N removed chars
     /// - `0` if the char is replacing the existing one
+    ///
     /// Since it is possible that the normalized string doesn't include some of the characters at
     /// the beginning of the original one, we need an `initial_offset` which represents the number
     /// of removed chars at the very beginning.
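These `1` / `-N` / `0` deltas are the protocol consumed by `NormalizedString::transform`; a sketch under the assumption that it takes an iterator of `(char, isize)` plus the initial offset:

```rust
use tokenizers::NormalizedString;

// ASCII-lowercase every char in place: each output char replaces an
// existing one, so every change is `0`, and nothing is removed up front.
fn ascii_lowercase(n: &mut NormalizedString) {
    let changes: Vec<(char, isize)> = n
        .get()
        .chars()
        .map(|c| (c.to_ascii_lowercase(), 0)) // `0`: replaces the existing char
        .collect();
    n.transform(changes, 0); // no chars removed at the very beginning
}
```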
@@ -424,6 +425,7 @@ impl NormalizedString {
     /// - `1` if this is a new char
     /// - `-N` if the char is right before N removed chars
     /// - `0` if the char is replacing the existing one
+    ///
     /// Since it is possible that the normalized string doesn't include some of the characters at
     /// the beginning of the original one, we need an `initial_offset` which represents the number
     /// of removed chars at the very beginning.
@@ -65,9 +65,9 @@ impl PreTokenizedString {
     ///
     /// There are only one constraint that *MUST* be respected:
     /// > The produced `NormalizedString`, if combined back together, must have the
-    /// same `original` string as the original one given to `split_fn`. This concretely
-    /// means that for the offset tracking to work as expected, `split_fn` must produce
-    /// "splits" of the original string.
+    /// > same `original` string as the original one given to `split_fn`. This concretely
+    /// > means that for the offset tracking to work as expected, `split_fn` must produce
+    /// > "splits" of the original string.
     pub fn split<F, U, R>(&mut self, mut split_fn: F) -> Result<()>
     where
         F: FnMut(usize, NormalizedString) -> Result<U>,
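For illustration, a `split_fn` that satisfies the quoted constraint: it only carves the original string into pieces, so offset tracking stays aligned (a sketch assuming the crate-root re-exports):

```rust
use tokenizers::{PreTokenizedString, Result, SplitDelimiterBehavior};

// Split on whitespace, dropping the delimiters. Each returned
// NormalizedString is a true "split" of the original, as required.
fn whitespace_splits(pretok: &mut PreTokenizedString) -> Result<()> {
    pretok.split(|_index, normalized| {
        normalized.split(char::is_whitespace, SplitDelimiterBehavior::Removed)
    })
}
```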
@@ -177,7 +177,6 @@ where
 mod tests {
     use crate::tokenizer::Tokenizer;
     use std::str::FromStr;
-    use tracing_subscriber::fmt;
 
     #[test]
     fn test_deserialization_serialization_invariant() {
@@ -236,7 +235,7 @@ mod tests {
     #[cfg(feature = "http")]
     #[test]
     fn test_from_pretrained() {
-        fmt()
+        tracing_subscriber::fmt()
             .with_max_level(tracing::Level::DEBUG)
             .with_target(false)
             .init();