mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-05 12:18:20 +00:00
Python - Bindings for TemplateProcessing
This commit is contained in:
@@ -4,3 +4,4 @@ PostProcessor = processors.PostProcessor
|
||||
BertProcessing = processors.BertProcessing
|
||||
RobertaProcessing = processors.RobertaProcessing
|
||||
ByteLevel = processors.ByteLevel
|
||||
TemplateProcessing = processors.TemplateProcessing
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import Tuple
|
||||
from typing import Tuple, Union, List
|
||||
|
||||
class PostProcessor:
|
||||
""" Base class for all post-processors
|
||||
@@ -89,7 +89,7 @@ class ByteLevel(PostProcessor):
|
||||
want the offsets to include these whitespaces, then this PostProcessor must be used.
|
||||
"""
|
||||
|
||||
def __init(self, trim_offsets: bool = True) -> None:
|
||||
def __init__(self, trim_offsets: bool = True) -> None:
    """ Instantiate a new ByteLevel

    Args:
        trim_offsets: bool:
            Whether to trim the whitespaces from the produced offsets.
    """
    # NOTE(review): stub signature only — the implementation appears to come
    # from the native `processors` module (see the module-level rebinding
    # `ByteLevel = processors.ByteLevel`); confirm against the binding.
    pass
|
||||
|
||||
# Type aliases used by TemplateProcessing.
# A template is either a single whitespace-delimited string, or an explicit
# list of token strings.
Template = Union[str, List[str]]
# Special tokens: (id, token) / (token, id) pairs in either order, or a dict
# with "id", "ids", "tokens" (and optionally "type_ids") keys — see the
# TemplateProcessing.__init__ docstring for the expected dict layout.
Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]]
|
||||
|
||||
class TemplateProcessing(PostProcessor):
    """ TemplateProcessing

    Provides a way to specify templates in order to add the special tokens to each
    input sequence as relevant.

    Let's take `BERT` tokenizer as an example. It uses two special tokens, used to
    delimitate each sequence. `[CLS]` is always used at the beginning of the first
    sequence, and `[SEP]` is added at the end of both the first, and the pair
    sequences. The final result looks like this:
        - Single sequence: `[CLS] Hello there [SEP]`
        - Pair sequences: `[CLS] My name is Anthony [SEP] What is my name? [SEP]`

    You can achieve such behavior using a TemplateProcessing:
    ```
    TemplateProcessing(
        seq_a="[CLS] $0 [SEP]",
        seq_b="$1 [SEP]",
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )
    ```

    In this example, $0 and $1 both represent the input sequences. The number in this
    identifier is actually the default type_id that will be used for each sequence. So,
    in this case, the first sequence will use 0, while the pair sequence will use 1.

    Note that we are saying the "default" type_id because each SpecialToken can define
    its own type_id which would override the provided default.
    """

    def __init__(self, seq_a: Template, seq_b: Template, special_tokens: Tokens) -> None:
        """ Instantiate a new TemplateProcessing

        Args:
            seq_a: Template
                The template for the first sequence.

            seq_b: Template:
                The template for the pair sequence.

            special_tokens: Tokens:
                The list of special tokens used in each sequences

            Template: Union[str, List[str]]:
                - If a `str` is provided, the whitespace is used as delimiter between tokens
                - If a `List[str]` is provided, a list of tokens

            Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]:
                - A Tuple with both a token and its associated ID, in any order
                - A dict with the following keys:
                    - "id": str => The special token id, as specified in the Template
                    - "ids": List[int] => The associated IDs
                    - "tokens": List[str] => The associated tokens
                    - "type_ids": Optional[List[Optional[int]]] => If specified, a list of optional
                      type_ids. If the `type_id` is not specified, the one from the input sequence
                      will be used.
                  The given dict expects the provided `ids`, `tokens` and `type_ids` lists to have
                  the same length.
        """
        # NOTE(review): stub signature only — the implementation appears to come
        # from the native `processors` module (see the module-level rebinding
        # `TemplateProcessing = processors.TemplateProcessing`); confirm against
        # the binding.
        pass
|
||||
|
||||
Reference in New Issue
Block a user