diff --git a/bindings/node/examples/documentation/pipeline.test.ts b/bindings/node/examples/documentation/pipeline.test.ts index cefd8c7c..b79ea6cf 100644 --- a/bindings/node/examples/documentation/pipeline.test.ts +++ b/bindings/node/examples/documentation/pipeline.test.ts @@ -9,14 +9,14 @@ describe("pipelineExample", () => { it("", async () => { // START reload_tokenizer - const { Tokenizer } = require("tokenizers/bindings/tokenizer"); + let { Tokenizer } = require("tokenizers/bindings/tokenizer"); - const tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json"); + let tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json"); // END reload_tokenizer // START setup_normalizer - const { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers/bindings/normalizers"); + let { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers/bindings/normalizers"); - const normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]); + let normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]); // END setup_normalizer // START test_normalizer let normalized = normalizer.normalizeStr("Héllò hôw are ü?") @@ -26,5 +26,36 @@ describe("pipelineExample", () => { // START replace_normalizer tokenizer.setNormalizer(normalizer) // END replace_normalizer + // START setup_pre_tokenizer + let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers"); + + var preTokenizer = whitespacePreTokenizer(); + var preTokenized = preTokenizer.preTokenizeStr("Hello! How are you? I'm fine, thank you."); + // END setup_pre_tokenizer + expect(preTokenized).toEqual([ + ["Hello", [0, 5]], + ["!", [5, 6]], + ["How", [7, 10]], + ["are", [11, 14]], + ["you", [15, 18]], + ["?", [18, 19]], + ["I", [20, 21]], + ["'", [21, 22]], + ['m', [22, 23]], + ["fine", [24, 28]], + [",", [28, 29]], + ["thank", [30, 35]], + ["you", [36, 39]], + [".", [39, 40]] + ]); + // START combine_pre_tokenizer + let { sequencePreTokenizer, digitsPreTokenizer } = require("tokenizers/bindings/pre_tokenizers"); + + var preTokenizer = sequencePreTokenizer([whitespacePreTokenizer(), digitsPreTokenizer(true)]); + var preTokenized = preTokenizer.preTokenizeStr("Call 911!"); + // END combine_pre_tokenizer + // START replace_pre_tokenizer + tokenizer.setPreTokenizer(preTokenizer) + // END replace_pre_tokenizer }); }); diff --git a/bindings/python/tests/documentation/test_pipeline.py b/bindings/python/tests/documentation/test_pipeline.py index 30c58f93..4b13e905 100644 --- a/bindings/python/tests/documentation/test_pipeline.py +++ b/bindings/python/tests/documentation/test_pipeline.py @@ -30,3 +30,46 @@ class TestPipeline: # START replace_normalizer tokenizer.normalizer = normalizer # END replace_normalizer + # START setup_pre_tokenizer + from tokenizers.pre_tokenizers import Whitespace + + pre_tokenizer = Whitespace() + pre_tokenizer.pre_tokenize_str("Hello! How are you? I'm fine, thank you.") + # [("Hello", (0, 5)), ("!", (5, 6)), ("How", (7, 10)), ("are", (11, 14)), ("you", (15, 18)), + # ("?", (18, 19)), ("I", (20, 21)), ("'", (21, 22)), ('m', (22, 23)), ("fine", (24, 28)), + # (",", (28, 29)), ("thank", (30, 35)), ("you", (36, 39)), (".", (39, 40))] + # END setup_pre_tokenizer + assert pre_tokenizer.pre_tokenize_str("Hello! How are you? 
I'm fine, thank you.") == [
+            ("Hello", (0, 5)),
+            ("!", (5, 6)),
+            ("How", (7, 10)),
+            ("are", (11, 14)),
+            ("you", (15, 18)),
+            ("?", (18, 19)),
+            ("I", (20, 21)),
+            ("'", (21, 22)),
+            ("m", (22, 23)),
+            ("fine", (24, 28)),
+            (",", (28, 29)),
+            ("thank", (30, 35)),
+            ("you", (36, 39)),
+            (".", (39, 40)),
+        ]
+        # START combine_pre_tokenizer
+        from tokenizers import pre_tokenizers
+        from tokenizers.pre_tokenizers import Digits
+
+        pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])
+        pre_tokenizer.pre_tokenize_str("Call 911!")
+        # [("Call", (0, 4)), ("9", (5, 6)), ("1", (6, 7)), ("1", (7, 8)), ("!", (8, 9))]
+        # END combine_pre_tokenizer
+        assert pre_tokenizer.pre_tokenize_str("Call 911!") == [
+            ("Call", (0, 4)),
+            ("9", (5, 6)),
+            ("1", (6, 7)),
+            ("1", (7, 8)),
+            ("!", (8, 9)),
+        ]
+        # START replace_pre_tokenizer
+        tokenizer.pre_tokenizer = pre_tokenizer
+        # END replace_pre_tokenizer
diff --git a/docs/source/entities.inc b/docs/source/entities.inc
index 59390a46..6364c3df 100644
--- a/docs/source/entities.inc
+++ b/docs/source/entities.inc
@@ -30,6 +30,10 @@
         :class:`~tokenizers.normalizers.Normalizer`
     normalizers.Sequence
         :class:`~tokenizers.normalizers.Sequence`
+    pre_tokenizers.Whitespace
+        :class:`~tokenizers.pre_tokenizers.Whitespace`
+    PreTokenizer
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer`
 
 .. entities:: rust
 
@@ -63,6 +67,10 @@
         :rust:trait:`~tokenizers::tokenizer::Normalizer`
     normalizers.Sequence
         :rust:struct:`~tokenizers::normalizers::utils::Sequence`
+    pre_tokenizers.Whitespace
+        :rust:struct:`~tokenizers::pre_tokenizers::whitespace::Whitespace`
+    PreTokenizer
+        :rust:trait:`~tokenizers::tokenizer::PreTokenizer`
 
 .. entities:: node
 
@@ -96,3 +104,7 @@
         :obj:`Normalizer`
     normalizers.Sequence
         :obj:`Sequence`
+    pre_tokenizers.Whitespace
+        :obj:`Whitespace`
+    PreTokenizer
+        :obj:`PreTokenizer`
diff --git a/docs/source/pipeline.rst b/docs/source/pipeline.rst
index 153140ba..88425d3c 100644
--- a/docs/source/pipeline.rst
+++ b/docs/source/pipeline.rst
@@ -152,43 +152,90 @@ pre-tokenizer will split your text into "words" and then, your final tokens will
 words.
 
 An easy way to pre-tokenize inputs is to split on spaces and punctuations, which is done by the
-:class:`~tokenizers.pre_tokenizers.Whitespace` pre-tokenizer:
+:entity:`pre_tokenizers.Whitespace` pre-tokenizer:
 
-.. code-block:: python
+.. only:: python
 
-    from tokenizers.pre_tokenizers import Whitespace
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START setup_pre_tokenizer
+        :end-before: END setup_pre_tokenizer
+        :dedent: 8
 
-    pre_tokenizer = Whitespace()
-    pre_tokenizer.pre_tokenize_str("Hello! How are you? I'm fine, thank you.")
-    # [("Hello", (0, 5)), ("!", (5, 6)), ("How", (7, 10)), ("are", (11, 14)), ("you", (15, 18)),
-    #  ("?", (18, 19)), ("I", (20, 21)), ("'", (21, 22)), ('m', (22, 23)), ("fine", (24, 28)),
-    #  (",", (28, 29)), ("thank", (30, 35)), ("you", (36, 39)), (".", (39, 40))]
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_setup_pre_tokenizer
+        :end-before: END pipeline_setup_pre_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START setup_pre_tokenizer
+        :end-before: END setup_pre_tokenizer
+        :dedent: 8
 
 The output is a list of tuples, with each tuple containing one word and its span in the original
-sentence (which is used to determine the final :obj:`offsets` of our :class:`~tokenizers.Encoding`).
+sentence (which is used to determine the final :obj:`offsets` of our :entity:`Encoding`).
 Note that splitting on punctuation will split contractions like :obj:`"I'm"` in this example.
 
-You can combine together any :class:`~tokenizers.pre_tokenizers.PreTokenizer` together. For
+You can combine any :entity:`PreTokenizer` with others. For
 instance, here is a pre-tokenizer that will split on space, punctuation and digits, separating
 numbers in their individual digits:
 
-.. code-block:: python
+.. only:: python
 
-    from tokenizers.pre_tokenizers import Digits
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START combine_pre_tokenizer
+        :end-before: END combine_pre_tokenizer
+        :dedent: 8
 
-    pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
-        Whitespace(),
-        Digits(individual_digits=True),
-    ])
-    pre_tokenizer.pre_tokenize_str("Call 911!")
-    # [("Call", (0, 4)), ("9", (5, 6)), ("1", (6, 7)), ("1", (7, 8)), ("!", (8, 9))]
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_combine_pre_tokenizer
+        :end-before: END pipeline_combine_pre_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START combine_pre_tokenizer
+        :end-before: END combine_pre_tokenizer
+        :dedent: 8
 
 As we saw in the :doc:`quicktour`, you can customize the pre-tokenizer of a
-:class:`~tokenizers.Tokenizer` by just changing the corresponding attribute:
+:entity:`Tokenizer` by just changing the corresponding attribute:
 
-.. code-block:: python
+.. only:: python
 
-    tokenizer.pre_tokenizer = pre_tokenizer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START replace_pre_tokenizer
+        :end-before: END replace_pre_tokenizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_replace_pre_tokenizer
+        :end-before: END pipeline_replace_pre_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START replace_pre_tokenizer
+        :end-before: END replace_pre_tokenizer
+        :dedent: 8
 
 Of course, if you change the way the pre-tokenizer, you should probably retrain your tokenizer from
 scratch afterward.
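For readers skimming the diff, here is a minimal Python sketch of the behaviour the new documentation snippets describe; it only assumes the tokenizers package is installed, and the expected outputs are the same ones asserted in the tests above:

    from tokenizers import pre_tokenizers
    from tokenizers.pre_tokenizers import Digits, Whitespace

    # Whitespace splits on words and punctuation, returning (piece, (start, end)) pairs.
    print(Whitespace().pre_tokenize_str("Hello! How are you? I'm fine, thank you."))
    # [('Hello', (0, 5)), ('!', (5, 6)), ('How', (7, 10)), ..., ('.', (39, 40))]

    # Sequence chains pre-tokenizers; Digits(individual_digits=True) then isolates each digit.
    combined = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])
    print(combined.pre_tokenize_str("Call 911!"))
    # [('Call', (0, 4)), ('9', (5, 6)), ('1', (6, 7)), ('1', (7, 8)), ('!', (8, 9))]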
diff --git a/tokenizers/tests/documentation.rs b/tokenizers/tests/documentation.rs index fb03b247..75d865b2 100644 --- a/tokenizers/tests/documentation.rs +++ b/tokenizers/tests/documentation.rs @@ -290,6 +290,70 @@ fn pipeline() -> tokenizers::Result<()> { // START pipeline_replace_normalizer tokenizer.with_normalizer(normalizer); // END pipeline_replace_normalizer + // START pipeline_setup_pre_tokenizer + use tokenizers::pre_tokenizers::whitespace::Whitespace; + use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer}; + + let pre_tokenizer = Whitespace::default(); + let mut pre_tokenized = PreTokenizedString::from("Hello! How are you? I'm fine, thank you."); + + pre_tokenizer.pre_tokenize(&mut pre_tokenized)?; + + println!( + "{:?}", + pre_tokenized.get_splits(OffsetReferential::Original, OffsetType::Byte) + ); + // [("Hello", (0, 5), None), ("!", (5, 6), None), ("How", (7, 10), None), + // ("are", (11, 14), None), ("you", (15, 18), None), ("?", (18, 19), None), + // ("I", (20, 21), None), ("\'", (21, 22), None), ("m", (22, 23), None), + // ("fine", (24, 28), None), (",", (28, 29), None), ("thank", (30, 35), None), + // ("you", (36, 39), None), (".", (39, 40), None)] + // END pipeline_setup_pre_tokenizer + assert_eq!( + pre_tokenized.get_splits(OffsetReferential::Original, OffsetType::Byte), + vec![ + ("Hello", (0, 5), &None), + ("!", (5, 6), &None), + ("How", (7, 10), &None), + ("are", (11, 14), &None), + ("you", (15, 18), &None), + ("?", (18, 19), &None), + ("I", (20, 21), &None), + ("\'", (21, 22), &None), + ("m", (22, 23), &None), + ("fine", (24, 28), &None), + (",", (28, 29), &None), + ("thank", (30, 35), &None), + ("you", (36, 39), &None), + (".", (39, 40), &None) + ] + ); + // START pipeline_combine_pre_tokenizer + use tokenizers::pre_tokenizers::{digits::Digits, sequence::Sequence}; + + let pre_tokenizer = Sequence::new(vec![Whitespace::default().into(), Digits::new(true).into()]); + let mut pre_tokenized = PreTokenizedString::from("Call 911!"); + + pre_tokenizer.pre_tokenize(&mut pre_tokenized)?; + + println!( + "{:?}", + pre_tokenized.get_splits(OffsetReferential::Original, OffsetType::Byte) + ); + // END pipeline_combine_pre_tokenizer + assert_eq!( + pre_tokenized.get_splits(OffsetReferential::Original, OffsetType::Byte), + vec![ + ("Call", (0, 4), &None), + ("9", (5, 6), &None), + ("1", (6, 7), &None), + ("1", (7, 8), &None), + ("!", (8, 9), &None) + ] + ); + // START pipeline_replace_pre_tokenizer + tokenizer.with_pre_tokenizer(pre_tokenizer); + // END pipeline_replace_pre_tokenizer Ok(()) }
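The replace_pre_tokenizer snippets only reassign an attribute; as a rough end-to-end sketch of what that assignment changes on the Python side (assuming the data/tokenizer-wiki.json file loaded earlier in these documentation tests is available locally):

    from tokenizers import Tokenizer, pre_tokenizers
    from tokenizers.pre_tokenizers import Digits, Whitespace

    # Assumed path: the same trained tokenizer file the documentation tests load.
    tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

    # Swapping the pre-tokenizer changes how raw text is cut into words before the
    # model sees it, so the produced tokens and their offsets can change.
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])

    encoding = tokenizer.encode("Call 911!")
    print(encoding.tokens)   # tokens produced with the new pre-tokenization
    print(encoding.offsets)  # character spans taken from the original string

As the updated pipeline.rst notes, swapping the pre-tokenizer of an already trained tokenizer generally calls for retraining, since the vocabulary was learned from the old word boundaries.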