Mirror of https://github.com/mii443/tokenizers.git
Python - encode & encode_batch with add_special_tokens
@@ -7,6 +7,8 @@ a high number of files as it avoids having too many progress bars on screen.
 avoids the unintuitive inclusion of the whitespaces in the produced offsets, even if these
 whitespaces are part of the actual token.
 It has been added to `ByteLevelBPETokenizer` but it is off by default (`trim_offsets=False`).
+- `encode` and `encode_batch` now take a new optional argument, specifying whether we should add the
+  special tokens. This stays activated by default.
 
 ## Fixes:
 - Fix some issues with the offsets being wrong with the `ByteLevel` BPE:
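A minimal usage sketch of the new flag (not part of the diff; it assumes an already trained `tokenizer` instance whose post-processor inserts special tokens):

```python
# Hedged sketch: `tokenizer` is assumed to be an existing, trained
# tokenizers.Tokenizer whose post-processor adds special tokens.
with_special = tokenizer.encode("Hello world")                                # add_special_tokens defaults to True
without_special = tokenizer.encode("Hello world", add_special_tokens=False)   # opt out of special tokens

print(with_special.tokens)     # may include e.g. <s> ... </s>, depending on the post-processor
print(without_special.tokens)  # no special tokens inserted
```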
@@ -159,20 +159,34 @@ impl Tokenizer {
         self.tokenizer.with_padding(None);
     }
 
-    fn encode(&self, sentence: &str, pair: Option<&str>) -> PyResult<Encoding> {
+    #[args(add_special_tokens = true)]
+    fn encode(
+        &self,
+        sentence: &str,
+        pair: Option<&str>,
+        add_special_tokens: bool,
+    ) -> PyResult<Encoding> {
         ToPyResult(
             self.tokenizer
-                .encode(if let Some(pair) = pair {
-                    tk::tokenizer::EncodeInput::Dual(sentence.to_owned(), pair.to_owned())
-                } else {
-                    tk::tokenizer::EncodeInput::Single(sentence.to_owned())
-                })
+                .encode(
+                    if let Some(pair) = pair {
+                        tk::tokenizer::EncodeInput::Dual(sentence.to_owned(), pair.to_owned())
+                    } else {
+                        tk::tokenizer::EncodeInput::Single(sentence.to_owned())
+                    },
+                    add_special_tokens,
+                )
                 .map(Encoding::new),
         )
         .into()
     }
 
-    fn encode_batch(&self, sentences: &PyList) -> PyResult<Vec<Encoding>> {
+    #[args(add_special_tokens = true)]
+    fn encode_batch(
+        &self,
+        sentences: &PyList,
+        add_special_tokens: bool,
+    ) -> PyResult<Vec<Encoding>> {
         let inputs = sentences
             .into_iter()
             .map(|item| {
@@ -190,7 +204,7 @@ impl Tokenizer {
 
         ToPyResult(
             self.tokenizer
-                .encode_batch(inputs)
+                .encode_batch(inputs, add_special_tokens)
                 .map(|encodings| encodings.into_iter().map(Encoding::new).collect()),
         )
         .into()
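Because the binding declares `#[args(add_special_tokens = true)]`, the new parameter has a default on the Python side and existing call sites keep working. A rough illustration (assuming an existing `tokenizer` instance as above):

```python
# Existing calls are unchanged; the flag only needs to be passed to opt out.
single = tokenizer.encode("A sentence")                             # EncodeInput::Single, flag defaults to True
pair = tokenizer.encode("A sentence", "Its pair")                   # EncodeInput::Dual, flag defaults to True
plain = tokenizer.encode("A sentence", add_special_tokens=False)    # explicitly skip special tokens
```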
@@ -244,7 +244,9 @@ class Tokenizer:
     def no_padding(self):
         """ Disable padding """
         pass
-    def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
+    def encode(
+        self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
+    ) -> Encoding:
         """ Encode the given sequence
 
         Args:
@@ -254,11 +256,16 @@ class Tokenizer:
             pair: (`optional`) Optional[str]:
                 The optional pair sequence
 
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding
+
         Returns:
             An Encoding
         """
         pass
-    def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
+    def encode_batch(
+        self, sequences: List[Union[str, Tuple[str, str]]], add_special_tokens: bool = True
+    ) -> List[Encoding]:
         """ Encode the given sequences or pair of sequences
 
         Args:
@@ -266,6 +273,9 @@ class Tokenizer:
                 A list of sequences or pair of sequences. The list can contain both
                 at the same time.
 
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding
+
         Returns:
             A list of Encoding
         """
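Following the stubs above, `encode_batch` accepts plain strings and pairs in the same list, with the flag applied to every item. A hedged sketch (again assuming a trained `tokenizer`):

```python
# Mixed batch of single sequences and pairs; the flag applies to the whole batch.
encodings = tokenizer.encode_batch(
    ["A single sequence", ("First of a pair", "Second of a pair")],
    add_special_tokens=False,
)
print([enc.tokens for enc in encodings])  # one Encoding per input, without special tokens
```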
@@ -125,7 +125,9 @@ class BaseTokenizer:
         """
         return self._tokenizer.add_special_tokens(special_tokens)
 
-    def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
+    def encode(
+        self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
+    ) -> Encoding:
         """ Encode the given sequence
 
         Args:
@@ -135,12 +137,17 @@ class BaseTokenizer:
             pair: (`optional`) Optional[str]:
                 The optional pair sequence
 
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding.
+
         Returns:
             An Encoding
         """
-        return self._tokenizer.encode(sequence, pair)
+        return self._tokenizer.encode(sequence, pair, add_special_tokens)
 
-    def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
+    def encode_batch(
+        self, sequences: List[Union[str, Tuple[str, str]]], add_special_tokens: bool = True
+    ) -> List[Encoding]:
         """ Encode the given sequences or pair of sequences
 
         Args:
@@ -148,10 +155,13 @@ class BaseTokenizer:
                 A list of sequences or pair of sequences. The list can contain both
                 at the same time.
 
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding.
+
         Returns:
             A list of Encoding
         """
-        return self._tokenizer.encode_batch(sequences)
+        return self._tokenizer.encode_batch(sequences, add_special_tokens)
 
     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
         """ Decode the given list of ids to a string sequence
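The `BaseTokenizer` wrappers simply forward the flag to the underlying Rust tokenizer, so every implementation, for example `ByteLevelBPETokenizer`, picks it up. A sketch with placeholder vocabulary files (paths and trained model are assumptions, not part of the diff):

```python
from tokenizers import ByteLevelBPETokenizer

# "vocab.json" / "merges.txt" are placeholder paths to an already trained model.
bbpe = ByteLevelBPETokenizer("vocab.json", "merges.txt")
enc = bbpe.encode("Some text")  # add_special_tokens defaults to True (no-op unless a post-processor adds them)
batch = bbpe.encode_batch(["Some text", "More text"], add_special_tokens=False)
```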