mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Update README.md (#1019)
* Update README.md: add reference to normalizer blog post
* Update lib.rs
* Fixing PR + clippy on node.
* Update readme to match docstring.
* Other clippy warning.
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
@ -31,7 +31,7 @@ impl FromJsValue for JsSplitDelimiterBehavior {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'s> From<JsSplitDelimiterBehavior> for SplitDelimiterBehavior {
|
||||
impl From<JsSplitDelimiterBehavior> for SplitDelimiterBehavior {
|
||||
fn from(v: JsSplitDelimiterBehavior) -> Self {
|
||||
v.0
|
||||
}
|
||||
|
@ -26,6 +26,8 @@ The various steps of the pipeline are:
|
||||
|
||||
1. The `Normalizer`: in charge of normalizing the text. Common examples of normalization are
|
||||
the [unicode normalization standards](https://unicode.org/reports/tr15/#Norm_Forms), such as `NFD` or `NFKC`.
|
||||
More details about how to use the `Normalizers` are available on the
|
||||
[Hugging Face blog](https://huggingface.co/docs/tokenizers/components#normalizers)
|
||||
2. The `PreTokenizer`: in charge of creating initial word splits in the text. The most common way of
|
||||
splitting text is simply on whitespace.
|
||||
3. The `Model`: in charge of doing the actual tokenization. An example of a `Model` would be
|
||||
|
@ -14,6 +14,8 @@
|
||||
//!
|
||||
//! 1. The `Normalizer`: in charge of normalizing the text. Common examples of normalization are
|
||||
//! the [unicode normalization standards](https://unicode.org/reports/tr15/#Norm_Forms), such as `NFD` or `NFKC`.
|
||||
//! More details about how to use the `Normalizers` are available on the
|
||||
//! [Hugging Face blog](https://huggingface.co/docs/tokenizers/components#normalizers)
|
||||
//! 2. The `PreTokenizer`: in charge of creating initial word splits in the text. The most common way of
|
||||
//! splitting text is simply on whitespace.
|
||||
//! 3. The `Model`: in charge of doing the actual tokenization. An example of a `Model` would be
|
||||
|
@ -136,7 +136,7 @@ fn log_sum_exp(x: f64, y: f64, init_mode: bool) -> f64 {
|
||||
|
||||
impl<'a> Lattice<'a> {
|
||||
pub fn from(sentence: &'a str, bos_id: usize, eos_id: usize) -> Self {
|
||||
let len = sentence.bytes().count();
|
||||
let len = sentence.len();
|
||||
let k_reserved_node_size = 16;
|
||||
// We are adding 2 tokens, bos and eos
|
||||
let mut nodes: Vec<NodeRef> = Vec::with_capacity(k_reserved_node_size);
|
||||
|
Reference in New Issue
Block a user