Python - Bindings for TemplateProcessing

This commit is contained in:
Anthony MOI
2020-09-09 17:13:08 -04:00
committed by Anthony MOI
parent c156ae3a83
commit 337fe72b13
5 changed files with 255 additions and 3 deletions

View File

@@ -4,3 +4,4 @@ PostProcessor = processors.PostProcessor
# Expose the post-processor classes of `processors` as module-level names.
BertProcessing = processors.BertProcessing
RobertaProcessing = processors.RobertaProcessing
ByteLevel = processors.ByteLevel
TemplateProcessing = processors.TemplateProcessing

View File

@@ -1,4 +1,4 @@
from typing import Tuple
from typing import Tuple, Union, List
class PostProcessor:
""" Base class for all post-processors
@@ -89,7 +89,7 @@ class ByteLevel(PostProcessor):
want the offsets to include these whitespaces, then this PostProcessor must be used.
"""
def __init(self, trim_offsets: bool = True) -> None:
def __init__(self, trim_offsets: bool = True) -> None:
""" Instantiate a new ByteLevel
Args:
@@ -97,3 +97,67 @@ class ByteLevel(PostProcessor):
Whether to trim the whitespaces from the produced offsets.
"""
pass
# A template is either a single string (split on whitespace into tokens)
# or an explicit list of tokens.
Template = Union[str, List[str]]
# Special tokens are given either as a (token, id) tuple in any order,
# or as a dict (see the TemplateProcessing.__init__ docstring for its keys).
Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]]
class TemplateProcessing(PostProcessor):
    """ TemplateProcessing

    Post-processor that adds the special tokens to each input sequence, as
    described by user-supplied templates.

    Take the `BERT` tokenizer as an example. It relies on two special tokens to
    delimit each sequence: `[CLS]` always opens the first sequence, and `[SEP]`
    closes both the first sequence and the pair sequence. The final result
    looks like this:
        - Single sequence: `[CLS] Hello there [SEP]`
        - Pair sequences: `[CLS] My name is Anthony [SEP] What is my name? [SEP]`

    The same behavior is obtained with the following TemplateProcessing:
    ```
    TemplateProcessing(
        seq_a="[CLS] $0 [SEP]",
        seq_b="$1 [SEP]",
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )
    ```

    Here `$0` and `$1` both stand for the input sequences. The number in each
    identifier is the default type_id used for that sequence, so the first
    sequence uses 0 while the pair sequence uses 1.

    It is only a "default" type_id because each SpecialToken can define its own
    type_id, which overrides the provided default.
    """

    def __init__(
        self,
        seq_a: Template,
        seq_b: Template,
        special_tokens: Tokens,
    ) -> None:
        """ Instantiate a new TemplateProcessing

        Args:
            seq_a: Template
                The template for the first sequence.

            seq_b: Template
                The template for the pair sequence.

            special_tokens: Tokens
                The list of special tokens used in each sequence.

        Template: Union[str, List[str]]
            - If a `str` is provided, whitespace is used as the delimiter
              between tokens
            - If a `List[str]` is provided, it is taken as a list of tokens

        Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]
            - A Tuple with both a token and its associated ID, in any order
            - A dict with the following keys:
                - "id": str => The special token id, as specified in the Template
                - "ids": List[int] => The associated IDs
                - "tokens": List[str] => The associated tokens
                - "type_ids": Optional[List[Optional[int]]] => If specified, a
                  list of optional type_ids. If a `type_id` is not specified,
                  the one from the input sequence will be used.

              The given dict expects the provided `ids`, `tokens` and
              `type_ids` lists to have the same length.
        """
        pass