Python - encode & encode_batch with add_special_tokens

Anthony MOI
2020-03-10 16:21:10 -04:00
parent 9e3d69389d
commit 257360acec
4 changed files with 50 additions and 14 deletions

View File

@@ -7,6 +7,8 @@ a high number of files as it avoids having too many progress bars on screen.
   avoids the unintuitive inclusion of the whitespaces in the produced offsets, even if these
   whitespaces are part of the actual token.
   It has been added to `ByteLevelBPETokenizer` but it is off by default (`trim_offsets=False`).
+- `encode` and `encode_batch` now take a new optional argument, specifying whether we should add the
+  special tokens. This is enabled by default.
 
 ## Fixes:
 - Fix some issues with the offsets being wrong with the `ByteLevel` BPE:
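For illustration, a minimal sketch of the new argument from Python (assuming a `BertWordPieceTokenizer` loaded from a placeholder `vocab.txt`; the exact tokens depend on the vocabulary):

from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer("vocab.txt")  # placeholder vocab path

# Default: special tokens are added around the sequence.
print(tokenizer.encode("Hello world").tokens)
# e.g. ['[CLS]', 'hello', 'world', '[SEP]']

# Opt out with the new argument.
print(tokenizer.encode("Hello world", add_special_tokens=False).tokens)
# e.g. ['hello', 'world']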

View File

@@ -159,20 +159,34 @@ impl Tokenizer {
         self.tokenizer.with_padding(None);
     }
 
-    fn encode(&self, sentence: &str, pair: Option<&str>) -> PyResult<Encoding> {
+    #[args(add_special_tokens = true)]
+    fn encode(
+        &self,
+        sentence: &str,
+        pair: Option<&str>,
+        add_special_tokens: bool,
+    ) -> PyResult<Encoding> {
         ToPyResult(
             self.tokenizer
-                .encode(if let Some(pair) = pair {
-                    tk::tokenizer::EncodeInput::Dual(sentence.to_owned(), pair.to_owned())
-                } else {
-                    tk::tokenizer::EncodeInput::Single(sentence.to_owned())
-                })
+                .encode(
+                    if let Some(pair) = pair {
+                        tk::tokenizer::EncodeInput::Dual(sentence.to_owned(), pair.to_owned())
+                    } else {
+                        tk::tokenizer::EncodeInput::Single(sentence.to_owned())
+                    },
+                    add_special_tokens,
+                )
                 .map(Encoding::new),
         )
         .into()
     }
 
-    fn encode_batch(&self, sentences: &PyList) -> PyResult<Vec<Encoding>> {
+    #[args(add_special_tokens = true)]
+    fn encode_batch(
+        &self,
+        sentences: &PyList,
+        add_special_tokens: bool,
+    ) -> PyResult<Vec<Encoding>> {
         let inputs = sentences
             .into_iter()
             .map(|item| {
@@ -190,7 +204,7 @@ impl Tokenizer {
         ToPyResult(
             self.tokenizer
-                .encode_batch(inputs)
+                .encode_batch(inputs, add_special_tokens)
                 .map(|encodings| encodings.into_iter().map(Encoding::new).collect()),
         )
         .into()
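From Python, the pyo3 `#[args(add_special_tokens = true)]` attribute keeps the new parameter optional with a default of `True`, so existing calls are unaffected. A hedged sketch of the three call shapes (assuming `tokenizer` is an already-loaded `tokenizers.Tokenizer`):

single = tokenizer.encode("Hello world")                    # EncodeInput::Single
dual = tokenizer.encode("Hello world", "How are you?")      # EncodeInput::Dual
plain = tokenizer.encode("Hello world", add_special_tokens=False)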

View File

@@ -244,7 +244,9 @@ class Tokenizer:
     def no_padding(self):
         """ Disable padding """
         pass
-    def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
+    def encode(
+        self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
+    ) -> Encoding:
         """ Encode the given sequence
 
         Args:
@@ -254,11 +256,16 @@ class Tokenizer:
             pair: (`optional`) Optional[str]:
                 The optional pair sequence
 
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding
+
         Returns:
             An Encoding
         """
         pass
-    def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
+    def encode_batch(
+        self, sequences: List[Union[str, Tuple[str, str]]], add_special_tokens: bool = True
+    ) -> List[Encoding]:
         """ Encode the given sequences or pair of sequences
 
         Args:
@@ -266,6 +273,9 @@ class Tokenizer:
             A list of sequences or pair of sequences. The list can contain both
                 at the same time.
 
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding
+
         Returns:
             A list of Encoding
         """

View File

@@ -125,7 +125,9 @@ class BaseTokenizer:
         """
         return self._tokenizer.add_special_tokens(special_tokens)
 
-    def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
+    def encode(
+        self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
+    ) -> Encoding:
         """ Encode the given sequence
 
         Args:
@@ -135,12 +137,17 @@ class BaseTokenizer:
            pair: (`optional`) Optional[str]:
                The optional pair sequence
 
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding.
+
         Returns:
             An Encoding
         """
-        return self._tokenizer.encode(sequence, pair)
+        return self._tokenizer.encode(sequence, pair, add_special_tokens)
 
-    def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
+    def encode_batch(
+        self, sequences: List[Union[str, Tuple[str, str]]], add_special_tokens: bool = True
+    ) -> List[Encoding]:
         """ Encode the given sequences or pair of sequences
 
         Args:
@@ -148,10 +155,13 @@ class BaseTokenizer:
             A list of sequences or pair of sequences. The list can contain both
                 at the same time.
 
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding.
+
         Returns:
             A list of Encoding
         """
-        return self._tokenizer.encode_batch(sequences)
+        return self._tokenizer.encode_batch(sequences, add_special_tokens)
 
     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
         """ Decode the given list of ids to a string sequence