Customize the doc for each language
bindings/node/examples/load.test.js
@@ -9,7 +9,9 @@ describe("loadExample", () => {
   const ids = [713, 16, 41, 1246];
   const tokens = ["This", "Ġis", "Ġan", "Ġexample"];
 
+  // START load
   const tokenizer = tokenizers.Tokenizer.fromFile("data/roberta.json");
+  // END load
 
   // You could also use regular callbacks
   const encode = promisify(tokenizer.encode.bind(tokenizer));
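
The region fenced by the START/END comments is what the docs pull in through literalinclude (see the index.rst hunks below). For reference, the same load-and-encode round trip these tests exercise looks like this in the Python binding; a minimal sketch, assuming a local data/roberta.json file:

from tokenizers import Tokenizer

# Load a previously saved tokenizer from its single-file JSON format.
tokenizer = Tokenizer.from_file("data/roberta.json")

# Encode the example the tests check against.
output = tokenizer.encode("This is an example")
print(output.ids)     # expected: [713, 16, 41, 1246]
print(output.tokens)  # expected: ["This", "Ġis", "Ġan", "Ġexample"]
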
bindings/python/tests/examples/test_load.py
@@ -1,7 +1,8 @@
 from tokenizers import Tokenizer
 
-
+# START load
 tokenizer = Tokenizer.from_file("data/roberta.json")
+# END load
 
 example = "This is an example"
 ids = [713, 16, 41, 1246]
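
Tokenizer.from_file has an inverse, Tokenizer.save, which writes the same single-file JSON representation, so a saved tokenizer round-trips losslessly. A minimal sketch; the output path is just an example:

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("data/roberta.json")

# Serialize the whole pipeline (normalizer, pre-tokenizer, model,
# post-processor) back to one JSON file; the path is illustrative.
tokenizer.save("my-roberta.json")

# Reloading restores an equivalent tokenizer.
reloaded = Tokenizer.from_file("my-roberta.json")
example = "This is an example"
assert reloaded.encode(example).ids == tokenizer.encode(example).ids
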
docs/source/index.rst
@@ -3,19 +3,17 @@
    You can adapt this file completely to your liking, but it should at least
    contain the root `toctree` directive.
 
-Welcome to tokenizers's documentation!
+Tokenizers
 ======================================
 
-.. toctree::
+Fast State-of-the-art tokenizers, optimized for both research and production
 
-   tokenizer_blocks
+`🤗 Tokenizers`_ provides an implementation of today's most used tokenizers, with
+a focus on performance and versatility. These tokenizers are also used in
+`🤗 Transformers`_.
 
-Getting started
-==================
-
-
-Provides an implementation of today's most used tokenizers, with a focus on performance and
-versatility.
+.. _🤗 Tokenizers: https://github.com/huggingface/tokenizers
+.. _🤗 Transformers: https://github.com/huggingface/transformers
 
 Main features:
 --------------
@@ -24,36 +22,46 @@ Main features:
  - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes
    less than 20 seconds to tokenize a GB of text on a server's CPU.
  - Easy to use, but also extremely versatile.
- - Designed for research and production.
- - Normalization comes with alignments tracking. It's always possible to get the part of the
-   original sentence that corresponds to a given token.
- - Does all the pre-processing: Truncate, Pad, add the special tokens your model needs.
- - Bindings to Rust, Python and Node.
+ - Designed for both research and production.
+ - Full alignment tracking. Even with destructive normalization, it's always possible to get
+   the part of the original sentence that corresponds to any token.
+ - Does all the pre-processing: Truncation, Padding, add the special tokens your model needs.
 
+Components:
+------------
+
+.. toctree::
+   :maxdepth: 2
+
+   tokenizer_blocks
+
 Load an existing tokenizer:
 ---------------------------
 
 Loading a previously saved tokenizer is extremely simple and requires a single line of code:
 
-.. tabs::
-
-   .. group-tab:: Rust
+.. only:: Rust
 
    .. literalinclude:: ../../tokenizers/examples/load.rs
       :language: rust
       :emphasize-lines: 4
      :start-after: START load
      :end-before: END load
      :dedent: 4
 
-   .. group-tab:: Python
+.. only:: Python
 
    .. literalinclude:: ../../bindings/python/tests/examples/test_load.py
      :language: python
      :emphasize-lines: 4
      :start-after: START load
      :end-before: END load
 
-   .. group-tab:: Node
+.. only:: Node
 
   .. literalinclude:: ../../bindings/node/examples/load.test.js
+      :language: javascript
+      :emphasize-lines: 11
+      :start-after: START load
+      :end-before: END load
+      :dedent: 4

Train a tokenizer:
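
"Full alignment tracking" in the feature list above is observable directly from the Python API: every Encoding carries character offsets back into the original input. A minimal sketch, again assuming the data/roberta.json file used by the examples:

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("data/roberta.json")
sentence = "This is an example"
output = tokenizer.encode(sentence)

# offsets[i] is the (start, end) character span of token i in the original
# sentence, so each token can be traced back even after normalization.
for token, (start, end) in zip(output.tokens, output.offsets):
    print(f"{token!r} -> {sentence[start:end]!r}")
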
tokenizers/examples/load.rs
@@ -1,7 +1,9 @@
 use tokenizers::Tokenizer;
 
 fn main() {
-    let tokenizer: Tokenizer = Tokenizer::from_file("data/roberta.json").unwrap();
+    // START load
+    let tokenizer = Tokenizer::from_file("data/roberta.json").unwrap();
+    // END load
 
     let example = "This is an example";
     let ids = vec![713, 16, 41, 1246];
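
All three example files gain the same START load / END load markers because Sphinx's literalinclude keeps only the lines between its :start-after: and :end-before: matches. A rough sketch of that selection logic in Python (extract_region is a hypothetical helper, not Sphinx's actual implementation):

def extract_region(path, start_marker="START load", end_marker="END load"):
    """Return the lines strictly between the first start/end marker hits,
    mimicking literalinclude's :start-after:/:end-before: options."""
    region, inside = [], False
    with open(path, encoding="utf-8") as f:
        for line in f:
            if inside and end_marker in line:
                break
            if inside:
                region.append(line)
            elif start_marker in line:
                inside = True
    return "".join(region)

# On the Rust example above this yields just the highlighted line:
#     let tokenizer = Tokenizer::from_file("data/roberta.json").unwrap();
print(extract_region("tokenizers/examples/load.rs"))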