Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Fix clippy warnings
@@ -57,11 +57,9 @@ impl Decoder for PyDecoder {
 #[pymethods]
 impl PyDecoder {
     #[staticmethod]
-    fn custom(decoder: PyObject) -> PyResult<Self> {
-        let decoder = PyDecoderWrapper::Custom(
-            CustomDecoder::new(decoder).map(|d| Arc::new(RwLock::new(d)))?,
-        );
-        Ok(PyDecoder::new(decoder))
+    fn custom(decoder: PyObject) -> Self {
+        let decoder = PyDecoderWrapper::Custom(Arc::new(RwLock::new(CustomDecoder::new(decoder))));
+        PyDecoder::new(decoder)
     }
 
     fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
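All of the hunks in this commit share one shape: a function that always returns `Ok(...)` loses its `PyResult`/`Result` wrapper, which matches clippy's `unnecessary_wraps` lint (running `cargo clippy --all-targets` on the workspace is the usual way to surface such warnings; the exact invocation used here is not recorded in the commit). A minimal, self-contained sketch of the pattern, with invented names that do not appear in the repository:

```rust
use std::collections::HashMap;

// Before the fix: the function can never fail, so the Result wrapper is pure
// noise and every caller pays for it with `?` or `.unwrap()`.
fn vocab_size_wrapped(vocab: &HashMap<String, u32>) -> Result<usize, String> {
    Ok(vocab.len()) // always Ok
}

// After the fix: return the value directly.
fn vocab_size(vocab: &HashMap<String, u32>) -> usize {
    vocab.len()
}

fn main() {
    let vocab = HashMap::from([("hello".to_string(), 0u32)]);
    assert_eq!(vocab_size(&vocab), vocab_size_wrapped(&vocab).unwrap());
}
```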
@@ -147,8 +145,8 @@ pub struct PyByteLevelDec {}
 #[pymethods]
 impl PyByteLevelDec {
     #[new]
-    fn new() -> PyResult<(Self, PyDecoder)> {
-        Ok((PyByteLevelDec {}, ByteLevel::default().into()))
+    fn new() -> (Self, PyDecoder) {
+        (PyByteLevelDec {}, ByteLevel::default().into())
     }
 }
 
@@ -188,8 +186,8 @@ impl PyWordPieceDec {
 
     #[new]
     #[args(prefix = "String::from(\"##\")", cleanup = "true")]
-    fn new(prefix: String, cleanup: bool) -> PyResult<(Self, PyDecoder)> {
-        Ok((PyWordPieceDec {}, WordPiece::new(prefix, cleanup).into()))
+    fn new(prefix: String, cleanup: bool) -> (Self, PyDecoder) {
+        (PyWordPieceDec {}, WordPiece::new(prefix, cleanup).into())
     }
 }
 
@@ -230,11 +228,11 @@ impl PyMetaspaceDec {
 
     #[new]
     #[args(replacement = "PyChar('▁')", add_prefix_space = "true")]
-    fn new(replacement: PyChar, add_prefix_space: bool) -> PyResult<(Self, PyDecoder)> {
-        Ok((
+    fn new(replacement: PyChar, add_prefix_space: bool) -> (Self, PyDecoder) {
+        (
             PyMetaspaceDec {},
             Metaspace::new(replacement.0, add_prefix_space).into(),
-        ))
+        )
     }
 }
 
@@ -261,8 +259,8 @@ impl PyBPEDecoder {
 
     #[new]
     #[args(suffix = "String::from(\"</w>\")")]
-    fn new(suffix: String) -> PyResult<(Self, PyDecoder)> {
-        Ok((PyBPEDecoder {}, BPEDecoder::new(suffix).into()))
+    fn new(suffix: String) -> (Self, PyDecoder) {
+        (PyBPEDecoder {}, BPEDecoder::new(suffix).into())
     }
 }
 
@@ -272,8 +270,8 @@ pub(crate) struct CustomDecoder {
 }
 
 impl CustomDecoder {
-    pub(crate) fn new(inner: PyObject) -> PyResult<Self> {
-        Ok(CustomDecoder { inner })
+    pub(crate) fn new(inner: PyObject) -> Self {
+        CustomDecoder { inner }
     }
 }
 
@@ -387,8 +385,7 @@ mod test {
             let obj: PyObject = Py::new(py, py_msp).unwrap().into_py(py);
             obj
         });
-        let py_seq =
-            PyDecoderWrapper::Custom(Arc::new(RwLock::new(CustomDecoder::new(obj).unwrap())));
+        let py_seq = PyDecoderWrapper::Custom(Arc::new(RwLock::new(CustomDecoder::new(obj))));
         assert!(serde_json::to_string(&py_seq).is_err());
     }
 }
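The updated test keeps the same behavioural check while dropping the now-unnecessary `.unwrap()`: a decoder backed by an arbitrary Python object still refuses to serialize. A rough sketch of that idea using only serde and serde_json, with invented stand-in types rather than the real pyo3-backed ones:

```rust
use serde::ser::{Error, Serialize, Serializer};

// Stand-in for a wrapper around an opaque Python object.
struct CustomDecoder;

impl Serialize for CustomDecoder {
    fn serialize<S: Serializer>(&self, _serializer: S) -> Result<S::Ok, S::Error> {
        // Custom decoders hold arbitrary Python state, so serialization is rejected.
        Err(S::Error::custom("Custom decoders cannot be serialized"))
    }
}

fn main() {
    let decoder = CustomDecoder;
    // Mirrors the assertion in the test hunk above.
    assert!(serde_json::to_string(&decoder).is_err());
}
```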
@@ -41,10 +41,10 @@ impl PySequenceProtocol for PyEncoding {
 #[pymethods]
 impl PyEncoding {
     #[new]
-    fn new() -> PyResult<Self> {
-        Ok(Self {
+    fn new() -> Self {
+        Self {
             encoding: tk::tokenizer::Encoding::default(),
-        })
+        }
     }
 
     fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
@@ -441,8 +441,7 @@ impl PyEncoding {
     /// The length of previous content to be included in each overflowing piece
     #[args(stride = "0")]
     #[text_signature = "(self, max_length, stride=0)"]
-    fn truncate(&mut self, max_length: usize, stride: usize) -> PyResult<()> {
+    fn truncate(&mut self, max_length: usize, stride: usize) {
         self.encoding.truncate(max_length, stride);
-        Ok(())
     }
 }
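The same reasoning applies to methods that return `PyResult<()>` only to end in `Ok(())`. A plain-Rust sketch of that shape (the `Encoding` type here is a simplified stand-in, not the real binding type):

```rust
// Simplified stand-in for illustration only.
struct Encoding {
    ids: Vec<u32>,
}

impl Encoding {
    // Before: fn truncate(&mut self, max_length: usize) -> Result<(), String> { ...; Ok(()) }
    // After: the operation cannot fail, so it returns nothing.
    fn truncate(&mut self, max_length: usize) {
        self.ids.truncate(max_length);
    }
}

fn main() {
    let mut encoding = Encoding { ids: vec![1, 2, 3, 4, 5] };
    encoding.truncate(3); // no Result left to unwrap at the call site
    assert_eq!(encoding.ids, vec![1, 2, 3]);
}
```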
@@ -91,12 +91,12 @@ where
 #[pymethods]
 impl PyModel {
     #[new]
-    fn __new__() -> PyResult<Self> {
+    fn __new__() -> Self {
         // Instantiate a default empty model. This doesn't really make sense, but we need
         // to be able to instantiate an empty model for pickle capabilities.
-        Ok(PyModel {
+        PyModel {
             model: Arc::new(RwLock::new(BPE::default().into())),
-        })
+        }
     }
 
     fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
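The `PyModel::__new__` hunk is the constructor variant of the pattern: a struct literal wrapped in `Ok(...)` becomes a plain return of `Self`. A hedged sketch with made-up types (the real code wraps a pyo3 model wrapper, which is omitted here):

```rust
use std::sync::{Arc, RwLock};

#[derive(Default)]
struct Bpe; // stand-in for the default BPE model

struct Model {
    inner: Arc<RwLock<Bpe>>,
}

impl Model {
    // Before: fn new() -> Result<Self, String> { Ok(Model { ... }) }
    // After: construction cannot fail, so no wrapper is needed.
    fn new() -> Self {
        Model {
            inner: Arc::new(RwLock::new(Bpe::default())),
        }
    }
}

fn main() {
    let model = Model::new(); // callers no longer write `?` or `.unwrap()`
    assert!(model.inner.read().is_ok());
}
```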
@@ -102,10 +102,10 @@ impl Normalizer for PyNormalizer {
 #[pymethods]
 impl PyNormalizer {
     #[staticmethod]
-    fn custom(obj: PyObject) -> PyResult<Self> {
-        Ok(Self {
+    fn custom(obj: PyObject) -> Self {
+        Self {
             normalizer: PyNormalizerWrapper::Custom(CustomNormalizer::new(obj)).into(),
-        })
+        }
     }
 
     fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
@@ -279,10 +279,10 @@ impl PyBertNormalizer {
         handle_chinese_chars: bool,
         strip_accents: Option<bool>,
         lowercase: bool,
-    ) -> PyResult<(Self, PyNormalizer)> {
+    ) -> (Self, PyNormalizer) {
         let normalizer =
             BertNormalizer::new(clean_text, handle_chinese_chars, strip_accents, lowercase);
-        Ok((PyBertNormalizer {}, normalizer.into()))
+        (PyBertNormalizer {}, normalizer.into())
     }
 }
 
@@ -293,8 +293,8 @@ pub struct PyNFD {}
 #[pymethods]
 impl PyNFD {
     #[new]
-    fn new() -> PyResult<(Self, PyNormalizer)> {
-        Ok((PyNFD {}, PyNormalizer::new(NFD.into())))
+    fn new() -> (Self, PyNormalizer) {
+        (PyNFD {}, PyNormalizer::new(NFD.into()))
     }
 }
 
@@ -305,8 +305,8 @@ pub struct PyNFKD {}
 #[pymethods]
 impl PyNFKD {
     #[new]
-    fn new() -> PyResult<(Self, PyNormalizer)> {
-        Ok((PyNFKD {}, NFKD.into()))
+    fn new() -> (Self, PyNormalizer) {
+        (PyNFKD {}, NFKD.into())
     }
 }
 
@@ -317,8 +317,8 @@ pub struct PyNFC {}
 #[pymethods]
 impl PyNFC {
     #[new]
-    fn new() -> PyResult<(Self, PyNormalizer)> {
-        Ok((PyNFC {}, NFC.into()))
+    fn new() -> (Self, PyNormalizer) {
+        (PyNFC {}, NFC.into())
     }
 }
 
@@ -329,8 +329,8 @@ pub struct PyNFKC {}
 #[pymethods]
 impl PyNFKC {
     #[new]
-    fn new() -> PyResult<(Self, PyNormalizer)> {
-        Ok((PyNFKC {}, NFKC.into()))
+    fn new() -> (Self, PyNormalizer) {
+        (PyNFKC {}, NFKC.into())
     }
 }
 
@@ -360,8 +360,8 @@ impl PySequence {
         ))
     }
 
-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
-        Ok(PyTuple::new(py, &[PyList::empty(py)]))
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
+        PyTuple::new(py, &[PyList::empty(py)])
     }
 }
 
@@ -379,8 +379,8 @@ pub struct PyLowercase {}
 #[pymethods]
 impl PyLowercase {
     #[new]
-    fn new() -> PyResult<(Self, PyNormalizer)> {
-        Ok((PyLowercase {}, Lowercase.into()))
+    fn new() -> (Self, PyNormalizer) {
+        (PyLowercase {}, Lowercase.into())
    }
 }
 
@@ -412,8 +412,8 @@ impl PyStrip {
 
     #[new]
     #[args(left = "true", right = "true")]
-    fn new(left: bool, right: bool) -> PyResult<(Self, PyNormalizer)> {
-        Ok((PyStrip {}, Strip::new(left, right).into()))
+    fn new(left: bool, right: bool) -> (Self, PyNormalizer) {
+        (PyStrip {}, Strip::new(left, right).into())
     }
 }
 
@@ -424,8 +424,8 @@ pub struct PyStripAccents {}
 #[pymethods]
 impl PyStripAccents {
     #[new]
-    fn new() -> PyResult<(Self, PyNormalizer)> {
-        Ok((PyStripAccents {}, StripAccents.into()))
+    fn new() -> (Self, PyNormalizer) {
+        (PyStripAccents {}, StripAccents.into())
     }
 }
 
@@ -436,8 +436,8 @@ pub struct PyNmt {}
 #[pymethods]
 impl PyNmt {
     #[new]
-    fn new() -> PyResult<(Self, PyNormalizer)> {
-        Ok((PyNmt {}, Nmt.into()))
+    fn new() -> (Self, PyNormalizer) {
+        (PyNmt {}, Nmt.into())
     }
 }
 
@@ -101,10 +101,10 @@ impl PreTokenizer for PyPreTokenizer {
 #[pymethods]
 impl PyPreTokenizer {
     #[staticmethod]
-    fn custom(pretok: PyObject) -> PyResult<Self> {
-        Ok(PyPreTokenizer {
+    fn custom(pretok: PyObject) -> Self {
+        PyPreTokenizer {
             pretok: PyPreTokenizerWrapper::Custom(CustomPreTokenizer::new(pretok)).into(),
-        })
+        }
     }
 
     fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
@@ -244,13 +244,13 @@ impl PyByteLevel {
 
     #[new]
     #[args(add_prefix_space = "true", _kwargs = "**")]
-    fn new(add_prefix_space: bool, _kwargs: Option<&PyDict>) -> PyResult<(Self, PyPreTokenizer)> {
-        Ok((
+    fn new(add_prefix_space: bool, _kwargs: Option<&PyDict>) -> (Self, PyPreTokenizer) {
+        (
             PyByteLevel {},
             ByteLevel::default()
                 .add_prefix_space(add_prefix_space)
                 .into(),
-        ))
+        )
     }
 
     /// Returns the alphabet used by this PreTokenizer.
@@ -278,8 +278,8 @@ pub struct PyWhitespace {}
 #[pymethods]
 impl PyWhitespace {
     #[new]
-    fn new() -> PyResult<(Self, PyPreTokenizer)> {
-        Ok((PyWhitespace {}, Whitespace::default().into()))
+    fn new() -> (Self, PyPreTokenizer) {
+        (PyWhitespace {}, Whitespace::default().into())
     }
 }
 
@@ -290,8 +290,8 @@ pub struct PyWhitespaceSplit {}
 #[pymethods]
 impl PyWhitespaceSplit {
     #[new]
-    fn new() -> PyResult<(Self, PyPreTokenizer)> {
-        Ok((PyWhitespaceSplit {}, WhitespaceSplit.into()))
+    fn new() -> (Self, PyPreTokenizer) {
+        (PyWhitespaceSplit {}, WhitespaceSplit.into())
     }
 }
 
@@ -332,8 +332,8 @@ impl PySplit {
         ))
     }
 
-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
-        Ok(PyTuple::new(py, &[" ", "removed"]))
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
+        PyTuple::new(py, &[" ", "removed"])
     }
 }
 
@@ -364,8 +364,8 @@ impl PyCharDelimiterSplit {
         ))
     }
 
-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
-        Ok(PyTuple::new(py, &[" "]))
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
+        PyTuple::new(py, &[" "])
     }
 }
 
@@ -379,8 +379,8 @@ pub struct PyBertPreTokenizer {}
 #[pymethods]
 impl PyBertPreTokenizer {
     #[new]
-    fn new() -> PyResult<(Self, PyPreTokenizer)> {
-        Ok((PyBertPreTokenizer {}, BertPreTokenizer.into()))
+    fn new() -> (Self, PyPreTokenizer) {
+        (PyBertPreTokenizer {}, BertPreTokenizer.into())
     }
 }
 
@@ -391,8 +391,8 @@ pub struct PyPunctuation {}
 #[pymethods]
 impl PyPunctuation {
     #[new]
-    fn new() -> PyResult<(Self, PyPreTokenizer)> {
-        Ok((PyPunctuation {}, Punctuation.into()))
+    fn new() -> (Self, PyPreTokenizer) {
+        (PyPunctuation {}, Punctuation.into())
     }
 }
 
@@ -420,8 +420,8 @@ impl PySequence {
         ))
     }
 
-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
-        Ok(PyTuple::new(py, &[PyList::empty(py)]))
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
+        PyTuple::new(py, &[PyList::empty(py)])
     }
 }
 
@@ -465,11 +465,11 @@ impl PyMetaspace {
 
     #[new]
     #[args(replacement = "PyChar('▁')", add_prefix_space = "true")]
-    fn new(replacement: PyChar, add_prefix_space: bool) -> PyResult<(Self, PyPreTokenizer)> {
-        Ok((
+    fn new(replacement: PyChar, add_prefix_space: bool) -> (Self, PyPreTokenizer) {
+        (
             PyMetaspace {},
             Metaspace::new(replacement.0, add_prefix_space).into(),
-        ))
+        )
     }
 }
 
@@ -501,8 +501,8 @@ impl PyDigits {
 
     #[new]
     #[args(individual_digits = false)]
-    fn new(individual_digits: bool) -> PyResult<(Self, PyPreTokenizer)> {
-        Ok((PyDigits {}, Digits::new(individual_digits).into()))
+    fn new(individual_digits: bool) -> (Self, PyPreTokenizer) {
+        (PyDigits {}, Digits::new(individual_digits).into())
     }
 }
 
@@ -516,8 +516,8 @@ pub struct PyUnicodeScripts {}
 #[pymethods]
 impl PyUnicodeScripts {
     #[new]
-    fn new() -> PyResult<(Self, PyPreTokenizer)> {
-        Ok((PyUnicodeScripts {}, UnicodeScripts::new().into()))
+    fn new() -> (Self, PyPreTokenizer) {
+        (PyUnicodeScripts {}, UnicodeScripts::new().into())
     }
 }
 
@@ -155,15 +155,15 @@ pub struct PyBertProcessing {}
 #[pymethods]
 impl PyBertProcessing {
     #[new]
-    fn new(sep: (String, u32), cls: (String, u32)) -> PyResult<(Self, PyPostProcessor)> {
-        Ok((
+    fn new(sep: (String, u32), cls: (String, u32)) -> (Self, PyPostProcessor) {
+        (
             PyBertProcessing {},
             PyPostProcessor::new(Arc::new(BertProcessing::new(sep, cls).into())),
-        ))
+        )
     }
 
-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
-        Ok(PyTuple::new(py, &[("", 0), ("", 0)]))
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
+        PyTuple::new(py, &[("", 0), ("", 0)])
     }
 }
 
@@ -203,18 +203,18 @@ impl PyRobertaProcessing {
         cls: (String, u32),
         trim_offsets: bool,
         add_prefix_space: bool,
-    ) -> PyResult<(Self, PyPostProcessor)> {
+    ) -> (Self, PyPostProcessor) {
         let proc = RobertaProcessing::new(sep, cls)
             .trim_offsets(trim_offsets)
             .add_prefix_space(add_prefix_space);
-        Ok((
+        (
             PyRobertaProcessing {},
             PyPostProcessor::new(Arc::new(proc.into())),
-        ))
+        )
     }
 
-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
-        Ok(PyTuple::new(py, &[("", 0), ("", 0)]))
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
+        PyTuple::new(py, &[("", 0), ("", 0)])
     }
 }
 
@@ -233,20 +233,17 @@ pub struct PyByteLevel {}
 impl PyByteLevel {
     #[new]
     #[args(trim_offsets = "None", _kwargs = "**")]
-    fn new(
-        trim_offsets: Option<bool>,
-        _kwargs: Option<&PyDict>,
-    ) -> PyResult<(Self, PyPostProcessor)> {
+    fn new(trim_offsets: Option<bool>, _kwargs: Option<&PyDict>) -> (Self, PyPostProcessor) {
         let mut byte_level = ByteLevel::default();
 
         if let Some(to) = trim_offsets {
            byte_level = byte_level.trim_offsets(to);
        }
 
-        Ok((
+        (
             PyByteLevel {},
             PyPostProcessor::new(Arc::new(byte_level.into())),
-        ))
+        )
     }
 }
 
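Beyond the return-type change, the post-processing `ByteLevel` hunk keeps its Option-driven builder intact: the default is only overridden when the caller actually supplied a value. A self-contained sketch of that idiom with simplified, invented types:

```rust
#[derive(Default)]
struct ByteLevel {
    trim_offsets: bool,
}

impl ByteLevel {
    // Builder-style setter, consuming and returning self.
    fn trim_offsets(mut self, v: bool) -> Self {
        self.trim_offsets = v;
        self
    }
}

// Only override the default when an explicit value was provided.
fn build(trim_offsets: Option<bool>) -> ByteLevel {
    let mut byte_level = ByteLevel::default();
    if let Some(to) = trim_offsets {
        byte_level = byte_level.trim_offsets(to);
    }
    byte_level
}

fn main() {
    assert!(!build(None).trim_offsets);
    assert!(build(Some(true)).trim_offsets);
}
```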
@@ -25,21 +25,21 @@ impl PyToken {
     }
 
     #[getter]
-    fn get_id(&self) -> PyResult<u32> {
-        Ok(self.token.id)
+    fn get_id(&self) -> u32 {
+        self.token.id
     }
 
     #[getter]
-    fn get_value(&self) -> PyResult<&str> {
-        Ok(&self.token.value)
+    fn get_value(&self) -> &str {
+        &self.token.value
     }
 
     #[getter]
-    fn get_offsets(&self) -> PyResult<(usize, usize)> {
-        Ok(self.token.offsets)
+    fn get_offsets(&self) -> (usize, usize) {
+        self.token.offsets
     }
 
-    fn as_tuple(&self) -> PyResult<(u32, &str, (usize, usize))> {
-        Ok((self.token.id, &self.token.value, self.token.offsets))
+    fn as_tuple(&self) -> (u32, &str, (usize, usize)) {
+        (self.token.id, &self.token.value, self.token.offsets)
     }
 }
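The `PyToken` hunk is the accessor case: each getter simply exposes a field, so returning the value directly is shorter on the Rust side while the Python-visible behaviour stays the same. A plain-Rust sketch without pyo3, using an invented `Token` type:

```rust
struct Token {
    id: u32,
    value: String,
    offsets: (usize, usize),
}

impl Token {
    // Infallible accessors return their values directly instead of Ok(value).
    fn id(&self) -> u32 {
        self.id
    }
    fn value(&self) -> &str {
        &self.value
    }
    fn as_tuple(&self) -> (u32, &str, (usize, usize)) {
        (self.id, &self.value, self.offsets)
    }
}

fn main() {
    let token = Token { id: 42, value: "hello".into(), offsets: (0, 5) };
    assert_eq!(token.as_tuple(), (42, "hello", (0, 5)));
    println!("{} -> {}", token.id(), token.value());
}
```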
@@ -487,10 +487,9 @@ impl PyTokenizer {
         }
     }
 
-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
         let model = PyModel::from(BPE::default()).into_py(py);
-        let args = PyTuple::new(py, vec![model]);
-        Ok(args)
+        PyTuple::new(py, vec![model])
     }
 
     /// Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
@@ -577,11 +576,10 @@ impl PyTokenizer {
     /// :param is_pair: Boolean indicating if the input would be a single sentence or a pair
     /// :return:
     #[text_signature = "(self, is_pair)"]
-    fn num_special_tokens_to_add(&self, is_pair: bool) -> PyResult<usize> {
-        Ok(self
-            .tokenizer
+    fn num_special_tokens_to_add(&self, is_pair: bool) -> usize {
+        self.tokenizer
             .get_post_processor()
-            .map_or(0, |p| p.added_tokens(is_pair)))
+            .map_or(0, |p| p.added_tokens(is_pair))
     }
 
     /// Get the underlying vocabulary
@@ -594,8 +592,8 @@ impl PyTokenizer {
     ///     :obj:`Dict[str, int]`: The vocabulary
     #[args(with_added_tokens = true)]
     #[text_signature = "(self, with_added_tokens=True)"]
-    fn get_vocab(&self, with_added_tokens: bool) -> PyResult<HashMap<String, u32>> {
-        Ok(self.tokenizer.get_vocab(with_added_tokens))
+    fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32> {
+        self.tokenizer.get_vocab(with_added_tokens)
     }
 
     /// Get the size of the underlying vocabulary
@@ -608,8 +606,8 @@ impl PyTokenizer {
     ///     :obj:`int`: The size of the vocabulary
     #[args(with_added_tokens = true)]
    #[text_signature = "(self, with_added_tokens=True)"]
-    fn get_vocab_size(&self, with_added_tokens: bool) -> PyResult<usize> {
-        Ok(self.tokenizer.get_vocab_size(with_added_tokens))
+    fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
+        self.tokenizer.get_vocab_size(with_added_tokens)
     }
 
     /// Enable truncation
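`get_vocab` and `get_vocab_size` are pure delegation to the underlying tokenizer, which cannot fail, so the `PyResult` wrapper was dropped there as well. A rough sketch of the delegation shape with a toy `Tokenizer` (names and fields are illustrative, not the real binding):

```rust
use std::collections::HashMap;

struct Tokenizer {
    vocab: HashMap<String, u32>,
}

impl Tokenizer {
    // Each method just forwards to infallible operations, so no Result is needed.
    fn get_vocab(&self) -> HashMap<String, u32> {
        self.vocab.clone()
    }
    fn get_vocab_size(&self) -> usize {
        self.vocab.len()
    }
}

fn main() {
    let tokenizer = Tokenizer {
        vocab: HashMap::from([("hello".to_string(), 0), ("world".to_string(), 1)]),
    };
    assert_eq!(tokenizer.get_vocab_size(), 2);
    assert_eq!(tokenizer.get_vocab().len(), 2);
}
```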
@@ -177,7 +177,7 @@ impl UnigramTrainer {
         &self,
         sentences: &[Sentence],
         _progress: &Option<ProgressBar>,
-    ) -> Result<Vec<SentencePiece>> {
+    ) -> Vec<SentencePiece> {
         // Put all sentences in a string, separated by \0
         let total: usize = sentences
             .iter()
@@ -245,7 +245,7 @@ impl UnigramTrainer {
             }
         }
         to_log_prob(&mut seed_sentencepieces);
-        Ok(seed_sentencepieces)
+        seed_sentencepieces
     }
     fn prune_sentence_pieces(
         &self,
@@ -469,7 +469,7 @@ impl UnigramTrainer {
 
         // We use a UNK token when training, whatever the `self.unk_token`
         pieces.push(("<UNK>".into(), f64::NAN));
-        pieces.extend(self.make_seed_sentence_pieces(&sentences, &progress)?);
+        pieces.extend(self.make_seed_sentence_pieces(&sentences, &progress));
         self.finalize_progress(&progress, sentences.len());
 
         // Useful to check compatibility with spm.
@@ -604,9 +604,7 @@ mod tests {
         assert_eq!(required_chars.len(), 13);
 
         let progress = None;
-        let table = trainer
-            .make_seed_sentence_pieces(&sentences, &progress)
-            .unwrap();
+        let table = trainer.make_seed_sentence_pieces(&sentences, &progress);
 
         let target_strings = vec![
             "s", "i", " ", "達", "友", "ん", "は", "に", "ち", "こ", "h", "a", "T", "is ", "s ",
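The trainer hunks show the knock-on effect at call sites: once `make_seed_sentence_pieces` stops returning `Result`, the `?` in the training path and the `.unwrap()` in the test disappear. A condensed sketch of that ripple, with the real bodies replaced by trivial stand-ins:

```rust
// Stand-in for the now-infallible helper: no Result, so no `?` downstream.
fn make_seed_sentence_pieces(sentences: &[&str]) -> Vec<(String, f64)> {
    sentences.iter().map(|s| (s.to_string(), 0.0)).collect()
}

// The surrounding training step may still be fallible for other reasons.
fn train(sentences: &[&str]) -> Result<Vec<(String, f64)>, String> {
    let mut pieces = vec![("<UNK>".to_string(), f64::NAN)];
    // Before: pieces.extend(make_seed_sentence_pieces(sentences)?);
    pieces.extend(make_seed_sentence_pieces(sentences));
    Ok(pieces)
}

fn main() {
    let pieces = train(&["This is a test", "こんにちは友達"]).unwrap();
    assert_eq!(pieces.len(), 3);
}
```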