Doc - Update PreTokenizer part of the Pipeline page
bindings/node/examples/documentation/pipeline.test.ts

@@ -9,14 +9,14 @@ describe("pipelineExample", () => {
     it("", async () => {
         // START reload_tokenizer
-        const { Tokenizer } = require("tokenizers/bindings/tokenizer");
+        let { Tokenizer } = require("tokenizers/bindings/tokenizer");

-        const tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
+        let tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
         // END reload_tokenizer
         // START setup_normalizer
-        const { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers/bindings/normalizers");
+        let { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers/bindings/normalizers");

-        const normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]);
+        let normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]);
         // END setup_normalizer
         // START test_normalizer
         let normalized = normalizer.normalizeStr("Héllò hôw are ü?")
@@ -26,5 +26,36 @@ describe("pipelineExample", () => {
         // START replace_normalizer
         tokenizer.setNormalizer(normalizer)
         // END replace_normalizer
+        // START setup_pre_tokenizer
+        let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
+
+        var preTokenizer = whitespacePreTokenizer();
+        var preTokenized = preTokenizer.preTokenizeStr("Hello! How are you? I'm fine, thank you.");
+        // END setup_pre_tokenizer
+        expect(preTokenized).toEqual([
+            ["Hello", [0, 5]],
+            ["!", [5, 6]],
+            ["How", [7, 10]],
+            ["are", [11, 14]],
+            ["you", [15, 18]],
+            ["?", [18, 19]],
+            ["I", [20, 21]],
+            ["'", [21, 22]],
+            ['m', [22, 23]],
+            ["fine", [24, 28]],
+            [",", [28, 29]],
+            ["thank", [30, 35]],
+            ["you", [36, 39]],
+            [".", [39, 40]]
+        ]);
+        // START combine_pre_tokenizer
+        let { sequencePreTokenizer, digitsPreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
+
+        var preTokenizer = sequencePreTokenizer([whitespacePreTokenizer(), digitsPreTokenizer(true)]);
+        var preTokenized = preTokenizer.preTokenizeStr("Call 911!");
+        // END combine_pre_tokenizer
+        // START replace_pre_tokenizer
+        tokenizer.setPreTokenizer(preTokenizer)
+        // END replace_pre_tokenizer
     });
 });
bindings/python/tests/documentation/test_pipeline.py

@@ -30,3 +30,46 @@ class TestPipeline:
         # START replace_normalizer
         tokenizer.normalizer = normalizer
         # END replace_normalizer
+        # START setup_pre_tokenizer
+        from tokenizers.pre_tokenizers import Whitespace
+
+        pre_tokenizer = Whitespace()
+        pre_tokenizer.pre_tokenize_str("Hello! How are you? I'm fine, thank you.")
+        # [("Hello", (0, 5)), ("!", (5, 6)), ("How", (7, 10)), ("are", (11, 14)), ("you", (15, 18)),
+        # ("?", (18, 19)), ("I", (20, 21)), ("'", (21, 22)), ('m', (22, 23)), ("fine", (24, 28)),
+        # (",", (28, 29)), ("thank", (30, 35)), ("you", (36, 39)), (".", (39, 40))]
+        # END setup_pre_tokenizer
+        assert pre_tokenizer.pre_tokenize_str("Hello! How are you? I'm fine, thank you.") == [
+            ("Hello", (0, 5)),
+            ("!", (5, 6)),
+            ("How", (7, 10)),
+            ("are", (11, 14)),
+            ("you", (15, 18)),
+            ("?", (18, 19)),
+            ("I", (20, 21)),
+            ("'", (21, 22)),
+            ("m", (22, 23)),
+            ("fine", (24, 28)),
+            (",", (28, 29)),
+            ("thank", (30, 35)),
+            ("you", (36, 39)),
+            (".", (39, 40)),
+        ]
+        # START combine_pre_tokenizer
+        from tokenizers import pre_tokenizers
+        from tokenizers.pre_tokenizers import Digits
+
+        pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])
+        pre_tokenizer.pre_tokenize_str("Call 911!")
+        # [("Call", (0, 4)), ("9", (5, 6)), ("1", (6, 7)), ("1", (7, 8)), ("!", (8, 9))]
+        # END combine_pre_tokenizer
+        assert pre_tokenizer.pre_tokenize_str("Call 911!") == [
+            ("Call", (0, 4)),
+            ("9", (5, 6)),
+            ("1", (6, 7)),
+            ("1", (7, 8)),
+            ("!", (8, 9)),
+        ]
+        # START replace_pre_tokenizer
+        tokenizer.pre_tokenizer = pre_tokenizer
+        # END replace_pre_tokenizer
docs/source/entities.inc

@@ -30,6 +30,10 @@
         :class:`~tokenizers.normalizers.Normalizer`
     normalizers.Sequence
         :class:`~tokenizers.normalizers.Sequence`
+    pre_tokenizers.Whitespace
+        :class:`~tokenizers.pre_tokenizers.Whitespace`
+    PreTokenizer
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer`

 .. entities:: rust

@@ -63,6 +67,10 @@
         :rust:trait:`~tokenizers::tokenizer::Normalizer`
     normalizers.Sequence
         :rust:struct:`~tokenizers::normalizers::utils::Sequence`
+    pre_tokenizers.Whitespace
+        :rust:struct:`~tokenizers::normalizers::whitespace::Whitespace`
+    PreTokenizer
+        :rust:trait:`~tokenizers::tokenizer::PreTokenizer`

 .. entities:: node

@@ -96,3 +104,7 @@
         :obj:`Normalizer`
     normalizers.Sequence
         :obj:`Sequence`
+    pre_tokenizers.Whitespace
+        :obj:`Whitespace`
+    PreTokenizer
+        :obj:`PreTokenizer`
docs/source/pipeline.rst

@@ -152,43 +152,90 @@ pre-tokenizer will split your text into "words" and then, your final tokens will
 words.

 An easy way to pre-tokenize inputs is to split on spaces and punctuations, which is done by the
-:class:`~tokenizers.pre_tokenizers.Whitespace` pre-tokenizer:
+:entity:`pre_tokenizers.Whitespace` pre-tokenizer:

-.. code-block:: python
+.. only:: python

-    from tokenizers.pre_tokenizers import Whitespace
-
-    pre_tokenizer = Whitespace()
-    pre_tokenizer.pre_tokenize_str("Hello! How are you? I'm fine, thank you.")
-    # [("Hello", (0, 5)), ("!", (5, 6)), ("How", (7, 10)), ("are", (11, 14)), ("you", (15, 18)),
-    # ("?", (18, 19)), ("I", (20, 21)), ("'", (21, 22)), ('m', (22, 23)), ("fine", (24, 28)),
-    # (",", (28, 29)), ("thank", (30, 35)), ("you", (36, 39)), (".", (39, 40))]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START setup_pre_tokenizer
+        :end-before: END setup_pre_tokenizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_setup_pre_tokenizer
+        :end-before: END pipeline_setup_pre_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START setup_pre_tokenizer
+        :end-before: END setup_pre_tokenizer
+        :dedent: 8

 The output is a list of tuples, with each tuple containing one word and its span in the original
-sentence (which is used to determine the final :obj:`offsets` of our :class:`~tokenizers.Encoding`).
+sentence (which is used to determine the final :obj:`offsets` of our :entity:`Encoding`).
 Note that splitting on punctuation will split contractions like :obj:`"I'm"` in this example.

-You can combine together any :class:`~tokenizers.pre_tokenizers.PreTokenizer` together. For
+You can combine together any :entity:`PreTokenizer` together. For
 instance, here is a pre-tokenizer that will split on space, punctuation and digits, separating
 numbers in their individual digits:

-.. code-block:: python
+.. only:: python

-    from tokenizers.pre_tokenizers import Digits
-
-    pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
-        Whitespace(),
-        Digits(individual_digits=True),
-    ])
-    pre_tokenizer.pre_tokenize_str("Call 911!")
-    # [("Call", (0, 4)), ("9", (5, 6)), ("1", (6, 7)), ("1", (7, 8)), ("!", (8, 9))]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START combine_pre_tokenizer
+        :end-before: END combine_pre_tokenizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_combine_pre_tokenizer
+        :end-before: END pipeline_combine_pre_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START combine_pre_tokenizer
+        :end-before: END combine_pre_tokenizer
+        :dedent: 8

 As we saw in the :doc:`quicktour`, you can customize the pre-tokenizer of a
-:class:`~tokenizers.Tokenizer` by just changing the corresponding attribute:
+:entity:`Tokenizer` by just changing the corresponding attribute:

-.. code-block:: python
+.. only:: python

-    tokenizer.pre_tokenizer = pre_tokenizer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START replace_pre_tokenizer
+        :end-before: END replace_pre_tokenizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_replace_pre_tokenizer
+        :end-before: END pipeline_replace_pre_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START replace_pre_tokenizer
+        :end-before: END replace_pre_tokenizer
+        :dedent: 8

 Of course, if you change the way the pre-tokenizer, you should probably retrain your tokenizer from
 scratch afterward.
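For quick reference, the snippets that the updated page now pulls in via literalinclude boil down to the following standalone sketch (assuming the tokenizers Python package is installed; the empty BPE() model below only stands in for the trained tokenizer loaded from data/tokenizer-wiki.json in the docs):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Digits, Sequence, Whitespace

# Whitespace splits on whitespace and punctuation; each item is (word, (start, end)).
pre_tokenizer = Whitespace()
print(pre_tokenizer.pre_tokenize_str("Hello! How are you? I'm fine, thank you."))
# [('Hello', (0, 5)), ('!', (5, 6)), ('How', (7, 10)), ..., ('.', (39, 40))]

# Pre-tokenizers can be chained: split on whitespace/punctuation first, then break
# numbers into individual digits.
pre_tokenizer = Sequence([Whitespace(), Digits(individual_digits=True)])
print(pre_tokenizer.pre_tokenize_str("Call 911!"))
# [('Call', (0, 4)), ('9', (5, 6)), ('1', (6, 7)), ('1', (7, 8)), ('!', (8, 9))]

# Replacing the pre-tokenizer of an existing Tokenizer is a plain attribute assignment.
tokenizer = Tokenizer(BPE())  # stand-in for a tokenizer loaded from disk
tokenizer.pre_tokenizer = pre_tokenizer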
tokenizers/tests/documentation.rs

@@ -290,6 +290,70 @@ fn pipeline() -> tokenizers::Result<()> {
     // START pipeline_replace_normalizer
     tokenizer.with_normalizer(normalizer);
     // END pipeline_replace_normalizer
+    // START pipeline_setup_pre_tokenizer
+    use tokenizers::pre_tokenizers::whitespace::Whitespace;
+    use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer};
+
+    let pre_tokenizer = Whitespace::default();
+    let mut pre_tokenized = PreTokenizedString::from("Hello! How are you? I'm fine, thank you.");
+
+    pre_tokenizer.pre_tokenize(&mut pre_tokenized)?;
+
+    println!(
+        "{:?}",
+        pre_tokenized.get_splits(OffsetReferential::Original, OffsetType::Byte)
+    );
+    // [("Hello", (0, 5), None), ("!", (5, 6), None), ("How", (7, 10), None),
+    // ("are", (11, 14), None), ("you", (15, 18), None), ("?", (18, 19), None),
+    // ("I", (20, 21), None), ("\'", (21, 22), None), ("m", (22, 23), None),
+    // ("fine", (24, 28), None), (",", (28, 29), None), ("thank", (30, 35), None),
+    // ("you", (36, 39), None), (".", (39, 40), None)]
+    // END pipeline_setup_pre_tokenizer
+    assert_eq!(
+        pre_tokenized.get_splits(OffsetReferential::Original, OffsetType::Byte),
+        vec![
+            ("Hello", (0, 5), &None),
+            ("!", (5, 6), &None),
+            ("How", (7, 10), &None),
+            ("are", (11, 14), &None),
+            ("you", (15, 18), &None),
+            ("?", (18, 19), &None),
+            ("I", (20, 21), &None),
+            ("\'", (21, 22), &None),
+            ("m", (22, 23), &None),
+            ("fine", (24, 28), &None),
+            (",", (28, 29), &None),
+            ("thank", (30, 35), &None),
+            ("you", (36, 39), &None),
+            (".", (39, 40), &None)
+        ]
+    );
+    // START pipeline_combine_pre_tokenizer
+    use tokenizers::pre_tokenizers::{digits::Digits, sequence::Sequence};
+
+    let pre_tokenizer = Sequence::new(vec![Whitespace::default().into(), Digits::new(true).into()]);
+    let mut pre_tokenized = PreTokenizedString::from("Call 911!");
+
+    pre_tokenizer.pre_tokenize(&mut pre_tokenized)?;
+
+    println!(
+        "{:?}",
+        pre_tokenized.get_splits(OffsetReferential::Original, OffsetType::Byte)
+    );
+    // END pipeline_combine_pre_tokenizer
+    assert_eq!(
+        pre_tokenized.get_splits(OffsetReferential::Original, OffsetType::Byte),
+        vec![
+            ("Call", (0, 4), &None),
+            ("9", (5, 6), &None),
+            ("1", (6, 7), &None),
+            ("1", (7, 8), &None),
+            ("!", (8, 9), &None)
+        ]
+    );
+    // START pipeline_replace_pre_tokenizer
+    tokenizer.with_pre_tokenizer(pre_tokenizer);
+    // END pipeline_replace_pre_tokenizer

     Ok(())
 }