Python - Bindings for TemplateProcessing
@@ -4,3 +4,4 @@ PostProcessor = processors.PostProcessor
 BertProcessing = processors.BertProcessing
 RobertaProcessing = processors.RobertaProcessing
 ByteLevel = processors.ByteLevel
+TemplateProcessing = processors.TemplateProcessing
@@ -1,4 +1,4 @@
-from typing import Tuple
+from typing import Tuple, Union, List

 class PostProcessor:
     """ Base class for all post-processors
@@ -89,7 +89,7 @@ class ByteLevel(PostProcessor):
     want the offsets to include these whitespaces, then this PostProcessor must be used.
     """

-    def __init(self, trim_offsets: bool = True) -> None:
+    def __init__(self, trim_offsets: bool = True) -> None:
         """ Instantiate a new ByteLevel

         Args:
@@ -97,3 +97,67 @@ class ByteLevel(PostProcessor):
                 Whether to trim the whitespaces from the produced offsets.
         """
         pass
+
+
+Template = Union[str, List[str]]
+Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]]
+
+
+class TemplateProcessing(PostProcessor):
+    """ TemplateProcessing
+
+    Provides a way to specify templates in order to add the special tokens to each
+    input sequence as relevant.
+
+    Let's take the `BERT` tokenizer as an example. It uses two special tokens to
+    delimit each sequence: `[CLS]` is always used at the beginning of the first
+    sequence, and `[SEP]` is added at the end of both the first and the pair
+    sequence. The final result looks like this:
+        - Single sequence: `[CLS] Hello there [SEP]`
+        - Pair sequences: `[CLS] My name is Anthony [SEP] What is my name? [SEP]`
+
+    You can achieve such behavior using a TemplateProcessing:
+    ```
+    TemplateProcessing(
+        seq_a="[CLS] $0 [SEP]",
+        seq_b="$1 [SEP]",
+        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
+    )
+    ```
+
+    In this example, $0 and $1 both represent the input sequences. The number in this
+    identifier is actually the default type_id that will be used for each sequence. So,
+    in this case, the first sequence will use 0, while the pair sequence will use 1.
+
+    Note that we are saying the "default" type_id because each SpecialToken can define
+    its own type_id, which would override the provided default.
+    """
+
+    def __init__(self, seq_a: Template, seq_b: Template, special_tokens: Tokens) -> None:
+        """ Instantiate a new TemplateProcessing
+
+        Args:
+            seq_a: Template
+                The template for the first sequence.
+
+            seq_b: Template:
+                The template for the pair sequence.
+
+            special_tokens: Tokens:
+                The list of special tokens used in each sequence.
+
+        Template: Union[str, List[str]]:
+            - If a `str` is provided, the whitespace is used as a delimiter between tokens
+            - If a `List[str]` is provided, a list of tokens
+
+        Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]:
+            - A Tuple with both a token and its associated ID, in any order
+            - A dict with the following keys:
+                - "id": str => The special token id, as specified in the Template
+                - "ids": List[int] => The associated IDs
+                - "tokens": List[str] => The associated tokens
+                - "type_ids": Optional[List[Optional[int]]] => If specified, a list of
+                  optional type_ids. If a `type_id` is not specified, the one from the
+                  input sequence will be used.
+              The given dict expects the provided `ids`, `tokens` and `type_ids` lists
+              to have the same length.
+        """
+        pass
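For context on the stub above, here is a minimal usage sketch of the template API described in that docstring, in the spirit of the tests added later in this commit. The toy vocabulary and the special-token IDs are illustrative assumptions, not values taken from the diff.

```
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.processors import TemplateProcessing

# Build a toy tokenizer; the special tokens get IDs 0 and 1 in insertion order.
tokenizer = Tokenizer(BPE())
tokenizer.add_special_tokens(["[CLS]", "[SEP]"])
tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

# Same template shape as in the docstring: $0/$1 carry the default type_ids
# 0 and 1, and each special token is declared together with its vocabulary ID.
tokenizer.post_processor = TemplateProcessing(
    seq_a="[CLS] $0 [SEP]",
    seq_b="$1 [SEP]",
    special_tokens=[("[CLS]", 0), ("[SEP]", 1)],
)

output = tokenizer.encode("my name", "pair")
# The template wraps the inputs in [CLS]/[SEP] and assigns type_id 0 to the
# first sequence and 1 to the pair, as described in the docstring above.
print(output.tokens, output.type_ids)
```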
@@ -1,3 +1,5 @@
+#![warn(clippy::all)]
+
 extern crate tokenizers as tk;

 mod decoders;
@@ -90,6 +92,7 @@ fn processors(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<processors::PyBertProcessing>()?;
     m.add_class::<processors::PyRobertaProcessing>()?;
     m.add_class::<processors::PyByteLevel>()?;
+    m.add_class::<processors::PyTemplateProcessing>()?;
     Ok(())
 }
@@ -4,10 +4,12 @@ use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::types::*;

+use crate::error::ToPyResult;
 use serde::{Deserialize, Serialize};
 use tk::processors::bert::BertProcessing;
 use tk::processors::byte_level::ByteLevel;
 use tk::processors::roberta::RobertaProcessing;
+use tk::processors::template::{SpecialToken, Template};
 use tk::processors::PostProcessorWrapper;
 use tk::{Encoding, PostProcessor};
 use tokenizers as tk;
@@ -38,6 +40,9 @@ impl PyPostProcessor {
             PostProcessorWrapper::Roberta(_) => {
                 Py::new(py, (PyRobertaProcessing {}, base)).map(Into::into)
             }
+            PostProcessorWrapper::Template(_) => {
+                Py::new(py, (PyTemplateProcessing {}, base)).map(Into::into)
+            }
         }
     }
 }
@@ -158,6 +163,104 @@ impl PyByteLevel {
     }
 }

+#[derive(Clone, Debug)]
+pub struct PySpecialToken(SpecialToken);
+
+impl From<PySpecialToken> for SpecialToken {
+    fn from(v: PySpecialToken) -> Self {
+        v.0
+    }
+}
+
+impl FromPyObject<'_> for PySpecialToken {
+    fn extract(ob: &PyAny) -> PyResult<Self> {
+        if let Ok(v) = ob.extract::<(String, u32)>() {
+            Ok(Self(v.into()))
+        } else if let Ok(v) = ob.extract::<(u32, String)>() {
+            Ok(Self(v.into()))
+        } else if let Ok(d) = ob.downcast::<PyDict>() {
+            let id = d
+                .get_item("id")
+                .ok_or_else(|| exceptions::ValueError::py_err("`id` must be specified"))?
+                .extract::<String>()?;
+            let ids = d
+                .get_item("ids")
+                .ok_or_else(|| exceptions::ValueError::py_err("`ids` must be specified"))?
+                .extract::<Vec<u32>>()?;
+            let type_ids = d.get_item("type_ids").map_or_else(
+                || Ok(vec![None; ids.len()]),
+                |v| v.extract::<Vec<Option<u32>>>(),
+            )?;
+            let tokens = d
+                .get_item("tokens")
+                .ok_or_else(|| exceptions::ValueError::py_err("`tokens` must be specified"))?
+                .extract::<Vec<String>>()?;
+
+            Ok(Self(
+                ToPyResult(SpecialToken::new(id, ids, type_ids, tokens)).into_py()?,
+            ))
+        } else {
+            Err(exceptions::TypeError::py_err(
+                "Expected Union[Tuple[str, int], Tuple[int, str], dict]",
+            ))
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct PyTemplate(Template);
+
+impl From<PyTemplate> for Template {
+    fn from(v: PyTemplate) -> Self {
+        v.0
+    }
+}
+
+impl FromPyObject<'_> for PyTemplate {
+    fn extract(ob: &PyAny) -> PyResult<Self> {
+        if let Ok(s) = ob.extract::<&str>() {
+            Ok(Self(s.into()))
+        } else if let Ok(s) = ob.extract::<Vec<&str>>() {
+            Ok(Self(s.into()))
+        } else {
+            Err(exceptions::TypeError::py_err(
+                "Expected Union[str, List[str]]",
+            ))
+        }
+    }
+}
+
+#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=TemplateProcessing)]
+pub struct PyTemplateProcessing {}
+#[pymethods]
+impl PyTemplateProcessing {
+    #[new]
+    #[args(seq_a = "None", seq_b = "None", special_tokens = "None")]
+    fn new(
+        seq_a: Option<PyTemplate>,
+        seq_b: Option<PyTemplate>,
+        special_tokens: Option<Vec<PySpecialToken>>,
+    ) -> PyResult<(Self, PyPostProcessor)> {
+        let mut builder = tk::processors::template::TemplateProcessing::builder();
+
+        if let Some(seq) = seq_a {
+            builder.sequence_a(seq);
+        }
+        if let Some(seq) = seq_b {
+            builder.sequence_b(seq);
+        }
+        if let Some(sp) = special_tokens {
+            builder.special_tokens(sp);
+        }
+        let processor = builder.build().map_err(exceptions::ValueError::py_err)?;
+
+        Ok((
+            PyTemplateProcessing {},
+            PyPostProcessor::new(Arc::new(processor.into())),
+        ))
+    }
+}
+
 #[cfg(test)]
 mod test {
     use std::sync::Arc;
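To make the conversion code above concrete, here is a hedged Python-side sketch of the value shapes that `PyTemplate` and `PySpecialToken` accept, matching the `extract` branches in this hunk. The token strings and IDs are placeholders.

```
from tokenizers.processors import TemplateProcessing

# The three shapes PySpecialToken::extract accepts:
cls_as_token_id = ("[CLS]", 1)   # (String, u32) tuple
cls_as_id_token = (1, "[CLS]")   # (u32, String) tuple, same meaning, reversed order
sep_as_dict = {                  # PyDict branch
    "id": "[SEP]",               # identifier referenced in the template
    "ids": [0],                  # associated token IDs
    "tokens": ["[SEP]"],         # associated tokens, same length as "ids"
    # "type_ids" may be omitted; it then defaults to [None] * len(ids)
}

# PyTemplate accepts either a whitespace-delimited str or a List[str].
processor = TemplateProcessing(
    seq_a="[CLS] $0 [SEP]",
    seq_b=["$1", "[SEP]"],
    special_tokens=[cls_as_token_id, sep_as_dict],
)
```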
@@ -5,7 +5,13 @@ from ..utils import data_dir, roberta_files
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
 from tokenizers.pre_tokenizers import ByteLevel as ByteLevelPreTokenizer
-from tokenizers.processors import PostProcessor, BertProcessing, RobertaProcessing, ByteLevel
+from tokenizers.processors import (
+    PostProcessor,
+    BertProcessing,
+    RobertaProcessing,
+    ByteLevel,
+    TemplateProcessing,
+)


 class TestBertProcessing:
@@ -73,3 +79,78 @@ class TestByteLevelProcessing:
         output = tokenizer.encode("My name is John")
         assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
+
+
+class TestTemplateProcessing:
+    def get_bert(self):
+        return TemplateProcessing(
+            seq_a=["[CLS]", "$0", "[SEP]"],
+            seq_b=["$1", "[SEP]"],
+            special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
+        )
+
+    def get_roberta(self):
+        return TemplateProcessing(
+            seq_a="<s> $0 </s>", seq_b="</s> $0 </s>", special_tokens=[("<s>", 0), ("</s>", 1)],
+        )
+
+    def get_t5_squad(self):
+        # >>> from transformers import AutoTokenizer
+        # >>> tok = AutoTokenizer.from_pretrained("t5-small")
+        # >>> tok.tokenize("question: ")
+        # ['▁question', ':']
+        # >>> tok.tokenize("context: ")
+        # ['▁context', ':']
+        # >>> tok.encode("context: ")
+        # [2625, 10]
+        # >>> tok.encode("question: ")
+        # [822, 10]
+
+        return TemplateProcessing(
+            seq_a=["Q", "$0"],
+            seq_b=["C", "$1"],
+            special_tokens=[
+                {
+                    "id": "Q",
+                    "ids": [2625, 10],
+                    "type_ids": [None, None],
+                    "tokens": ["_question", ":"],
+                },
+                {
+                    "id": "C",
+                    "ids": [822, 10],
+                    "type_ids": [None, None],
+                    "tokens": ["_context", ":"],
+                },
+            ],
+        )
+
+    def test_instantiate(self):
+        bert = self.get_bert()
+        assert bert is not None
+        assert isinstance(bert, PostProcessor)
+        assert isinstance(bert, TemplateProcessing)
+        assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing)
+
+    def test_bert_parity(self):
+        tokenizer = Tokenizer(BPE())
+        tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+        tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))
+
+        original = tokenizer.encode("my name", "pair")
+
+        tokenizer.post_processor = self.get_bert()
+        template = tokenizer.encode("my name", "pair")
+        assert original.ids == template.ids
+
+    def test_roberta_parity(self):
+        tokenizer = Tokenizer(BPE())
+        tokenizer.add_special_tokens(["<s>", "</s>"])
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+        tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))
+
+        original = tokenizer.encode("my name is john", "pair")
+        tokenizer.post_processor = self.get_roberta()
+        template = tokenizer.encode("my name is john", "pair")
+        assert original.ids == template.ids