Update README.md (#1019)

* Update README.md

Add reference to normalizer blog post

* Update lib.rs

* Address PR feedback + clippy warning on node.

* Update readme to match docstring.

* Fix another clippy warning.

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
Commit eb2213842b (parent 3564f24311)
Author: Arthur
Date: 2022-07-19 09:54:29 +02:00
Committed by: GitHub
4 changed files with 6 additions and 2 deletions


@@ -31,7 +31,7 @@ impl FromJsValue for JsSplitDelimiterBehavior {
     }
 }
-impl<'s> From<JsSplitDelimiterBehavior> for SplitDelimiterBehavior {
+impl From<JsSplitDelimiterBehavior> for SplitDelimiterBehavior {
     fn from(v: JsSplitDelimiterBehavior) -> Self {
         v.0
     }
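The clippy fix in this hunk removes a lifetime parameter that is declared but never used anywhere in the impl. A minimal self-contained sketch of the pattern — the types here are toy stand-ins inferred from the diff (the newtype wrapper is assumed from the `v.0` field access), not the crate's real definitions:

```rust
// Toy stand-in for the real enum; the actual crate's variants are richer.
#[derive(Clone, Copy, Debug, PartialEq)]
enum SplitDelimiterBehavior {
    Removed,
    Isolated,
}

// Assumed newtype wrapper, inferred from the `v.0` access in the diff.
struct JsSplitDelimiterBehavior(SplitDelimiterBehavior);

// Writing `impl<'s> From<...>` here would declare a lifetime used nowhere
// in the impl block, which clippy flags (`extra_unused_lifetimes`).
// Dropping the `<'s>` changes nothing about the generated code.
impl From<JsSplitDelimiterBehavior> for SplitDelimiterBehavior {
    fn from(v: JsSplitDelimiterBehavior) -> Self {
        v.0
    }
}

fn main() {
    let js = JsSplitDelimiterBehavior(SplitDelimiterBehavior::Removed);
    let behavior = SplitDelimiterBehavior::from(js);
    assert_eq!(behavior, SplitDelimiterBehavior::Removed);
}
```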


@@ -26,6 +26,8 @@ The various steps of the pipeline are:
 1. The `Normalizer`: in charge of normalizing the text. Common examples of normalization are
    the [unicode normalization standards](https://unicode.org/reports/tr15/#Norm_Forms), such as `NFD` or `NFKC`.
+   More details about how to use the `Normalizers` are available on the
+   [Hugging Face blog](https://huggingface.co/docs/tokenizers/components#normalizers)
 2. The `PreTokenizer`: in charge of creating initial words splits in the text. The most common way of
    splitting text is simply on whitespace.
 3. The `Model`: in charge of doing the actual tokenization. An example of a `Model` would be
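The first two pipeline steps described in this hunk can be sketched with the standard library alone. The `normalize`/`pre_tokenize` functions below are illustrative stand-ins, not the crate's actual `Normalizer` and `PreTokenizer` components:

```rust
// Illustrative stand-ins for the first two pipeline steps; the real crate
// exposes these as configurable `Normalizer` and `PreTokenizer` traits.

// Step 1: normalize the text. Toy version: lowercase only; real
// normalizers apply e.g. NFD or NFKC unicode normalization.
fn normalize(text: &str) -> String {
    text.to_lowercase()
}

// Step 2: pre-tokenize by splitting on whitespace, the most common scheme.
fn pre_tokenize(text: &str) -> Vec<&str> {
    text.split_whitespace().collect()
}

fn main() {
    let normalized = normalize("Hello  World");
    let words = pre_tokenize(&normalized);
    assert_eq!(words, vec!["hello", "world"]);
}
```

The `Model` step (the actual tokenization algorithm, e.g. BPE or Unigram) would then consume the pre-tokenized words.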


@@ -14,6 +14,8 @@
 //!
 //! 1. The `Normalizer`: in charge of normalizing the text. Common examples of normalization are
 //!    the [unicode normalization standards](https://unicode.org/reports/tr15/#Norm_Forms), such as `NFD` or `NFKC`.
+//!    More details about how to use the `Normalizers` are available on the
+//!    [Hugging Face blog](https://huggingface.co/docs/tokenizers/components#normalizers)
 //! 2. The `PreTokenizer`: in charge of creating initial words splits in the text. The most common way of
 //!    splitting text is simply on whitespace.
 //! 3. The `Model`: in charge of doing the actual tokenization. An example of a `Model` would be


@@ -136,7 +136,7 @@ fn log_sum_exp(x: f64, y: f64, init_mode: bool) -> f64 {
 impl<'a> Lattice<'a> {
     pub fn from(sentence: &'a str, bos_id: usize, eos_id: usize) -> Self {
-        let len = sentence.bytes().count();
+        let len = sentence.len();
         let k_reserved_node_size = 16;
         // We are adding 2 tokens, bos and eos
         let mut nodes: Vec<NodeRef> = Vec::with_capacity(k_reserved_node_size);
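The `Lattice::from` change above is behavior-preserving: for a Rust `&str`, `len()` already returns the UTF-8 byte count, so it matches `bytes().count()` exactly while running in constant time instead of walking the whole string. A quick standalone check:

```rust
fn main() {
    let sentence = "héllo";                               // multi-byte UTF-8
    assert_eq!(sentence.len(), sentence.bytes().count()); // both count bytes
    assert_eq!(sentence.len(), 6);                        // 6 bytes...
    assert_eq!(sentence.chars().count(), 5);              // ...but 5 chars
}
```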