Python - Add bindings for new AddedToken options

Anthony MOI
2020-03-23 16:39:39 -04:00
parent b1998da070
commit c65d53892d
11 changed files with 169 additions and 47 deletions


@ -85,6 +85,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
#[pymodule]
fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<tokenizer::Tokenizer>()?;
m.add_class::<tokenizer::AddedToken>()?;
m.add_class::<encoding::Encoding>()?;
m.add_wrapped(wrap_pymodule!(models))?;
m.add_wrapped(wrap_pymodule!(pre_tokenizers))?;


@ -18,6 +18,34 @@ use tk::tokenizer::{
PaddingDirection, PaddingParams, PaddingStrategy, TruncationParams, TruncationStrategy,
};
#[pyclass(dict)]
pub struct AddedToken {
pub token: tk::tokenizer::AddedToken,
}
#[pymethods]
impl AddedToken {
#[new]
#[args(kwargs = "**")]
fn new(obj: &PyRawObject, content: &str, kwargs: Option<&PyDict>) -> PyResult<()> {
let mut token = tk::tokenizer::AddedToken::from(content.to_owned());
if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
let key: &str = key.extract()?;
match key {
"single_word" => token = token.single_word(value.extract()?),
"lstrip" => token = token.lstrip(value.extract()?),
"rstrip" => token = token.rstrip(value.extract()?),
_ => println!("Ignored unknown kwarg option {}", key),
}
}
}
obj.init({ AddedToken { token } });
Ok(())
}
}
#[pyclass(dict)]
pub struct Tokenizer {
tokenizer: tk::tokenizer::Tokenizer,
@ -256,14 +284,11 @@ impl Tokenizer {
content,
..Default::default()
})
} else if let Ok((content, single_word)) = token.extract::<(String, bool)>() {
Ok(tk::tokenizer::AddedToken {
content,
single_word,
})
} else if let Ok(token) = token.cast_as::<AddedToken>() {
Ok(token.token.clone())
} else {
Err(exceptions::Exception::py_err(
"Input must be a list[str] or list[(str, bool)]",
"Input must be a List[Union[str, AddedToken]]",
))
}
})
@ -272,7 +297,25 @@ impl Tokenizer {
Ok(self.tokenizer.add_tokens(&tokens))
}
fn add_special_tokens(&mut self, tokens: Vec<&str>) -> PyResult<usize> {
fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens
.into_iter()
.map(|token| {
if let Ok(content) = token.extract::<String>() {
Ok(tk::tokenizer::AddedToken {
content,
..Default::default()
})
} else if let Ok(token) = token.cast_as::<AddedToken>() {
Ok(token.token.clone())
} else {
Err(exceptions::Exception::py_err(
"Input must be a List[Union[str, AddedToken]]",
))
}
})
.collect::<PyResult<Vec<_>>>()?;
Ok(self.tokenizer.add_special_tokens(&tokens))
}


@ -1,6 +1,8 @@
extern crate tokenizers as tk;
use super::utils::Container;
use crate::tokenizer::AddedToken;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
@ -28,7 +30,27 @@ impl BpeTrainer {
"vocab_size" => builder = builder.vocab_size(val.extract()?),
"min_frequency" => builder = builder.min_frequency(val.extract()?),
"show_progress" => builder = builder.show_progress(val.extract()?),
"special_tokens" => builder = builder.special_tokens(val.extract()?),
"special_tokens" => {
builder = builder.special_tokens(
val.cast_as::<PyList>()?
.into_iter()
.map(|token| {
if let Ok(content) = token.extract::<String>() {
Ok(tk::tokenizer::AddedToken {
content,
..Default::default()
})
} else if let Ok(token) = token.cast_as::<AddedToken>() {
Ok(token.token.clone())
} else {
Err(exceptions::Exception::py_err(
"special_tokens must be a List[Union[str, AddedToken]]",
))
}
})
.collect::<PyResult<Vec<_>>>()?,
);
}
"limit_alphabet" => builder = builder.limit_alphabet(val.extract()?),
"initial_alphabet" => {
let alphabet: Vec<String> = val.extract()?;
@ -74,7 +96,27 @@ impl WordPieceTrainer {
"vocab_size" => builder = builder.vocab_size(val.extract()?),
"min_frequency" => builder = builder.min_frequency(val.extract()?),
"show_progress" => builder = builder.show_progress(val.extract()?),
"special_tokens" => builder = builder.special_tokens(val.extract()?),
"special_tokens" => {
builder = builder.special_tokens(
val.cast_as::<PyList>()?
.into_iter()
.map(|token| {
if let Ok(content) = token.extract::<String>() {
Ok(tk::tokenizer::AddedToken {
content,
..Default::default()
})
} else if let Ok(token) = token.cast_as::<AddedToken>() {
Ok(token.token.clone())
} else {
Err(exceptions::Exception::py_err(
"special_tokens must be a List[Union[str, AddedToken]]",
))
}
})
.collect::<PyResult<Vec<_>>>()?,
);
}
"limit_alphabet" => builder = builder.limit_alphabet(val.extract()?),
"initial_alphabet" => {
let alphabet: Vec<String> = val.extract()?;


@ -1,6 +1,6 @@
__version__ = "0.6.0"
from .tokenizers import Tokenizer, Encoding
from .tokenizers import Tokenizer, Encoding, AddedToken
from .tokenizers import decoders
from .tokenizers import models
from .tokenizers import normalizers


@ -91,6 +91,37 @@ class Encoding:
"""
pass
class AddedToken:
""" AddedToken represents a token to be added to a Tokenizer
An AddedToken can have special options defining the way it should behave.
"""
def __new__(
cls, content: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False
) -> AddedToken:
""" Instantiate a new AddedToken
Args:
content: str:
The content of the token
single_word: bool
Whether this token should only match against a single word. If True,
this token will never match inside a word.
lstrip: bool
Whether this token should strip all potential whitespace on its left side.
If True, this token will greedily match any whitespace on the left and then
strip it out.
rstrip: bool
Whether this token should strip all potential whitespace on its right side.
If True, this token will greedily match any whitespace on the right and then
strip it out.
"""
pass
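
For illustration only (not part of this diff), a minimal sketch of how the options above can be combined, assuming the tokenizers package built from this commit:

from tokenizers import AddedToken

# A [MASK] token that also captures the whitespace on its left,
# and a token that should only ever match a whole word.
mask = AddedToken("[MASK]", lstrip=True)
word_only = AddedToken("my_token", single_word=True)
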
class Tokenizer:
""" Tokenizer
@ -320,29 +351,28 @@ class Tokenizer:
The corresponding string if it exists, None otherwise
"""
pass
def add_tokens(self, tokens: List[Union[str, Tuple[str, bool]]]) -> int:
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
""" Add the given tokens to the vocabulary
Args:
tokens: List[Union[str, Tuple[str, bool]]]:
tokens: List[Union[str, AddedToken]]:
A list of tokens to add to the vocabulary. Each token can either be
a string, or a tuple with a string representing the token, and a boolean
option representing whether to match on single words only.
If the boolean is not included, it defaults to False
a string, or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
"""
pass
def add_special_tokens(self, tokens: List[str]) -> int:
def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
""" Add the given special tokens to the vocabulary, and treat them as special tokens.
The special tokens will never be processed by the model, and will be
removed while decoding.
Args:
tokens: List[str]:
The list of special tokens to add
tokens: List[Union[str, AddedToken]]:
The list of special tokens to add. Each token can either be a string
or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
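
A usage sketch for the two methods above (illustrative only; the BPE.empty() model constructor is assumed from the surrounding 0.6.0 API and is not part of this diff):

from tokenizers import Tokenizer, AddedToken
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE.empty())
# Plain strings and AddedToken instances can be mixed freely.
tokenizer.add_tokens(["my_token", AddedToken("my_other_token", single_word=True)])
tokenizer.add_special_tokens(["[PAD]", AddedToken("[MASK]", lstrip=True)])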


@ -95,30 +95,29 @@ class BaseTokenizer:
""" Disable truncation """
return self._tokenizer.no_truncation()
def add_tokens(self, tokens: List[Union[str, Tuple[str, bool]]]) -> int:
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
""" Add the given tokens to the vocabulary
Args:
tokens: List[Union[str, Tuple[str, bool]]]:
tokens: List[Union[str, AddedToken]]:
A list of tokens to add to the vocabulary. Each token can either be
a string, or a tuple with a string representing the token, and a boolean
option representing whether to match on single words only.
If the boolean is not included, it defaults to False
a string, or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
"""
return self._tokenizer.add_tokens(tokens)
def add_special_tokens(self, special_tokens: List[str]) -> int:
def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
""" Add the given special tokens to the vocabulary, and treat them as special tokens.
The special tokens will never be processed by the model, and will be
removed while decoding.
Args:
tokens: List[str]:
The list of special tokens to add
tokens: List[Union[str, AddedToken]]:
A list of special tokens to add to the vocabulary. Each token can either be
a string, or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary


@ -1,4 +1,4 @@
from tokenizers import Tokenizer, decoders, trainers
from tokenizers import Tokenizer, AddedToken, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
@ -15,11 +15,11 @@ class BertWordPieceTokenizer(BaseTokenizer):
self,
vocab_file: Optional[str] = None,
add_special_tokens: bool = True,
unk_token: str = "[UNK]",
sep_token: str = "[SEP]",
cls_token: str = "[CLS]",
pad_token: str = "[PAD]",
mask_token: str = "[MASK]",
unk_token: Union[str, AddedToken] = "[UNK]",
sep_token: Union[str, AddedToken] = "[SEP]",
cls_token: Union[str, AddedToken] = "[CLS]",
pad_token: Union[str, AddedToken] = "[PAD]",
mask_token: Union[str, AddedToken] = "[MASK]",
clean_text: bool = True,
handle_chinese_chars: bool = True,
strip_accents: bool = True,
@ -89,7 +89,13 @@ class BertWordPieceTokenizer(BaseTokenizer):
min_frequency: int = 2,
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
special_tokens: List[str] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
special_tokens: List[Union[str, AddedToken]] = [
"[PAD]",
"[UNK]",
"[CLS]",
"[SEP]",
"[MASK]",
],
show_progress: bool = True,
wordpieces_prefix: str = "##",
):
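
An illustrative sketch of the widened constructor signature (the vocabulary path is a placeholder, and the top-level re-export of BertWordPieceTokenizer is assumed):

from tokenizers import AddedToken, BertWordPieceTokenizer

# "vocab.txt" is a hypothetical path to an existing wordpiece vocabulary.
tokenizer = BertWordPieceTokenizer(
    "vocab.txt",
    mask_token=AddedToken("[MASK]", lstrip=True),
)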


@ -1,4 +1,4 @@
from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers, processors
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers, processors
from tokenizers.models import BPE
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
from .base_tokenizer import BaseTokenizer
@ -76,7 +76,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
vocab_size: int = 30000,
min_frequency: int = 2,
show_progress: bool = True,
special_tokens: List[str] = [],
special_tokens: List[Union[str, AddedToken]] = [],
):
""" Train the model using the given files """


@ -1,4 +1,4 @@
from .. import Tokenizer, pre_tokenizers, decoders, trainers
from .. import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from ..models import BPE
from ..normalizers import Sequence, Lowercase, unicode_normalizer_from_str
from .base_tokenizer import BaseTokenizer
@ -16,8 +16,8 @@ class CharBPETokenizer(BaseTokenizer):
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
unk_token: Optional[str] = "<unk>",
suffix: Optional[str] = "</w>",
unk_token: Union[str, AddedToken] = "<unk>",
suffix: str = "</w>",
dropout: Optional[float] = None,
lowercase: bool = False,
unicode_normalizer: Optional[str] = None,
@ -73,7 +73,7 @@ class CharBPETokenizer(BaseTokenizer):
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
special_tokens: List[str] = ["<unk>"],
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
suffix: Optional[str] = "</w>",


@ -1,4 +1,4 @@
from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
from .base_tokenizer import BaseTokenizer
@ -16,7 +16,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
unk_token: str = "<unk>",
unk_token: Union[str, AddedToken] = "<unk>",
replacement: str = "▁",
add_prefix_space: bool = True,
dropout: Optional[float] = None,
@ -54,7 +54,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
special_tokens: List[str] = ["<unk>"],
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
show_progress: bool = True,


@ -1,4 +1,5 @@
from typing import Optional, List
from .. import AddedToken
from typing import Optional, List, Union
class Trainer:
""" Base class for all trainers
@ -18,7 +19,7 @@ class BpeTrainer(Trainer):
vocab_size: int = 30000,
min_frequency: int = 0,
show_progress: bool = True,
special_tokens: List[str] = [],
special_tokens: List[Union[str, AddedToken]] = [],
limit_alphabet: Optional[int] = None,
initial_alphabet: List[str] = [],
continuing_subword_prefix: Optional[str] = None,
@ -36,7 +37,7 @@ class BpeTrainer(Trainer):
show_progress: boolean:
Whether to show progress bars while training.
special_tokens: List[str]:
special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of.
limit_alphabet: unsigned int:
@ -70,7 +71,7 @@ class WordPieceTrainer(Trainer):
vocab_size: int = 30000,
min_frequency: int = 0,
show_progress: bool = True,
special_tokens: List[str] = [],
special_tokens: List[Union[str, AddedToken]] = [],
limit_alphabet: Optional[int] = None,
initial_alphabet: List[str] = [],
continuing_subword_prefix: Optional[str] = "##",
@ -88,7 +89,7 @@ class WordPieceTrainer(Trainer):
show_progress: boolean:
Whether to show progress bars while training.
special_tokens: List[str]:
special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of.
limit_alphabet: unsigned int:
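
A corresponding trainer-side sketch (illustrative only), matching the stubs above:

from tokenizers import AddedToken, trainers

# Special tokens can now carry their matching options into training.
trainer = trainers.BpeTrainer(
    vocab_size=30000,
    special_tokens=["<pad>", AddedToken("<mask>", lstrip=True)],
)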