Adding Sequence for PostProcessor. (#1052)

* Adding `Sequence` for `PostProcessor`.

* Fixing node? Writing in the dark here, don't have Python2.7

* `undefined` is not accepted.

* Other test.
Nicolas Patry
2022-08-25 14:50:06 +02:00
committed by GitHub
parent 37f7bae0f7
commit 06025e4ca1
12 changed files with 344 additions and 7 deletions

View File

@@ -5,4 +5,5 @@ PostProcessor = processors.PostProcessor
BertProcessing = processors.BertProcessing
ByteLevel = processors.ByteLevel
RobertaProcessing = processors.RobertaProcessing
Sequence = processors.Sequence
TemplateProcessing = processors.TemplateProcessing

View File

@@ -193,6 +193,48 @@ class RobertaProcessing(PostProcessor):
        """
        pass

class Sequence(PostProcessor):
    """
    Sequence Processor

    Args:
        processors (:obj:`List[PostProcessor]`)
            The processors that need to be chained
    """

    def __init__(self, processors):
        pass

    def num_special_tokens_to_add(self, is_pair):
        """
        Return the number of special tokens that would be added for single/pair sentences.

        Args:
            is_pair (:obj:`bool`):
                Whether the input would be a pair of sequences

        Returns:
            :obj:`int`: The number of tokens to add
        """
        pass

    def process(self, encoding, pair=None, add_special_tokens=True):
        """
        Post-process the given encodings, generating the final one

        Args:
            encoding (:class:`~tokenizers.Encoding`):
                The encoding for the first sequence

            pair (:class:`~tokenizers.Encoding`, `optional`):
                The encoding for the pair sequence

            add_special_tokens (:obj:`bool`):
                Whether to add the special tokens

        Return:
            :class:`~tokenizers.Encoding`: The final encoding
        """
        pass

class TemplateProcessing(PostProcessor):
    """
    Provides a way to specify templates in order to add the special tokens to each
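For orientation, a minimal usage sketch of the new `Sequence` post-processor (hypothetical snippet, not part of the diff; it assumes an already-built `tokenizer` and reuses the processors from the test file further down):

from tokenizers.processors import Sequence, ByteLevel, TemplateProcessing

# Chain offset trimming (ByteLevel) with special-token insertion (TemplateProcessing);
# the processors are presumably applied in the order they are listed.
tokenizer.post_processor = Sequence([
    ByteLevel(trim_offsets=True),
    TemplateProcessing(
        single=["[CLS]", "$0", "[SEP]"],
        pair=["[CLS]:0", "$A", "[SEP]:0", "$B:1", "[SEP]:1"],
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    ),
])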

View File

@@ -104,6 +104,7 @@ fn processors(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<processors::PyRobertaProcessing>()?;
    m.add_class::<processors::PyByteLevel>()?;
    m.add_class::<processors::PyTemplateProcessing>()?;
    m.add_class::<processors::PySequence>()?;
    Ok(())
}

View File

@@ -11,6 +11,7 @@ use serde::{Deserialize, Serialize};
use tk::processors::bert::BertProcessing;
use tk::processors::byte_level::ByteLevel;
use tk::processors::roberta::RobertaProcessing;
use tk::processors::sequence::Sequence;
use tk::processors::template::{SpecialToken, Template};
use tk::processors::PostProcessorWrapper;
use tk::{Encoding, PostProcessor};
@@ -50,6 +51,7 @@ impl PyPostProcessor {
            PostProcessorWrapper::Template(_) => {
                Py::new(py, (PyTemplateProcessing {}, base))?.into_py(py)
            }
            PostProcessorWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?.into_py(py),
        })
    }
}
@@ -414,6 +416,37 @@ impl PyTemplateProcessing {
    }
}

/// Sequence Processor
///
/// Args:
///     processors (:obj:`List[PostProcessor]`)
///         The processors that need to be chained
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "Sequence")]
#[pyo3(text_signature = "(self, processors)")]
pub struct PySequence {}

#[pymethods]
impl PySequence {
    #[new]
    #[args(processors)]
    fn new(processors_py: &PyList) -> (Self, PyPostProcessor) {
        let mut processors: Vec<PostProcessorWrapper> = Vec::with_capacity(processors_py.len());
        for n in processors_py.iter() {
            let processor: PyRef<PyPostProcessor> = n.extract().unwrap();
            let processor = processor.processor.as_ref();
            processors.push(processor.clone());
        }
        let sequence_processor = Sequence::new(processors);
        (
            PySequence {},
            PyPostProcessor::new(Arc::new(PostProcessorWrapper::Sequence(sequence_processor))),
        )
    }

    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
        PyTuple::new(py, &[PyList::empty(py)])
    }
}

#[cfg(test)]
mod test {
    use std::sync::Arc;
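A note on the `__getnewargs__` above: it lets pickle call `__new__` with an empty processor list when unpickling, with the actual state presumably restored through the shared `PyPostProcessor` serialization. A rough sketch of the behaviour the test below also exercises (hypothetical snippet, not part of the diff):

import pickle
from tokenizers.processors import Sequence, ByteLevel

# Round-trip a Sequence through pickle; the restored object is still a Sequence.
processor = Sequence([ByteLevel(trim_offsets=True)])
restored = pickle.loads(pickle.dumps(processor))
assert isinstance(restored, Sequence)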

View File

@@ -13,6 +13,7 @@ from tokenizers.processors import (
    RobertaProcessing,
    ByteLevel,
    TemplateProcessing,
    Sequence,
)
@@ -179,3 +180,49 @@ class TestTemplateProcessing:
        tokenizer.post_processor = self.get_roberta()
        template = tokenizer.encode("my name is john", "pair")
        assert original.ids == template.ids


class TestSequenceProcessing:
    def test_sequence_processing(self):
        assert Sequence([]) is not None
        assert Sequence([ByteLevel()]) is not None
        assert isinstance(Sequence([]), PostProcessor)
        assert isinstance(Sequence([]), Sequence)
        serialized = pickle.dumps(Sequence([]))
        assert isinstance(pickle.loads(serialized), Sequence)

    def test_post_process(self):
        byte_level = ByteLevel(trim_offsets=True)
        template = TemplateProcessing(
            single=["[CLS]", "$0", "[SEP]"],
            pair=["[CLS]:0", "$A", "[SEP]:0", "$B:1", "[SEP]:1"],
            special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
        )

        tokenizer = Tokenizer(BPE())
        tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
        tokenizer.add_tokens(["my", "name", "is", "Ġjohn", "pair"])
        tokenizer.post_processor = template

        # Before the sequence
        original = tokenizer.encode("my name is Ġjohn")
        assert original.ids == [1, 2, 3, 4, 5, 0]
        assert original.type_ids == [0, 0, 0, 0, 0, 0]
        assert original.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 16), (0, 0)]

        pair = tokenizer.encode("my name is Ġjohn", "pair")
        # assert pair.ids == [1, 2, 3, 4, 5, 0, 6, 0]
        assert pair.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]
        assert pair.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 16), (0, 0), (0, 4), (0, 0)]

        processor = Sequence([byte_level, template])
        tokenizer.post_processor = processor

        original = tokenizer.encode("my name is Ġjohn")
        assert original.ids == [1, 2, 3, 4, 5, 0]
        assert original.type_ids == [0, 0, 0, 0, 0, 0]
        # Offsets ARE trimmed
        assert original.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (12, 16), (0, 0)]

        pair = tokenizer.encode("my name is Ġjohn", "pair")
        # assert pair.ids == [1, 2, 3, 4, 5, 0, 6, 0]
        assert pair.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]
        assert pair.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (12, 16), (0, 0), (0, 4), (0, 0)]