mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-30 20:19:21 +00:00
Python - Add bindings to Tokenizer.from_pretrained
This commit is contained in:
964
bindings/python/Cargo.lock
generated
964
bindings/python/Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -858,6 +858,26 @@ class Tokenizer:
|
|||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
def from_pretrained(identifier, revision="main", auth_token=None):
|
||||||
|
"""
|
||||||
|
Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
|
||||||
|
Hugging Face Hub.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
identifier (:obj:`str`):
|
||||||
|
The identifier of a Model on the Hugging Face Hub, that contains
|
||||||
|
a tokenizer.json file
|
||||||
|
revision (:obj:`str`, defaults to `main`):
|
||||||
|
A branch or commit id
|
||||||
|
auth_token (:obj:`str`, `optional`, defaults to `None`):
|
||||||
|
An optional auth token used to access private repositories on the
|
||||||
|
Hugging Face Hub
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:class:`~tokenizers.Tokenizer`: The new tokenizer
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
@staticmethod
|
||||||
def from_str(json):
|
def from_str(json):
|
||||||
"""
|
"""
|
||||||
Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
|
Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
|
||||||
|
@ -18,6 +18,8 @@ mod utils;
|
|||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
use pyo3::wrap_pymodule;
|
use pyo3::wrap_pymodule;
|
||||||
|
|
||||||
|
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||||
|
|
||||||
// For users using multiprocessing in python, it is quite easy to fork the process running
|
// For users using multiprocessing in python, it is quite easy to fork the process running
|
||||||
// tokenizers, ending up with a deadlock because we internally make use of multithreading. So
|
// tokenizers, ending up with a deadlock because we internally make use of multithreading. So
|
||||||
// we register a callback to be called in the event of a fork so that we can warn the user.
|
// we register a callback to be called in the event of a fork so that we can warn the user.
|
||||||
|
@ -544,6 +544,43 @@ impl PyTokenizer {
|
|||||||
Ok(Self { tokenizer })
|
Ok(Self { tokenizer })
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
|
||||||
|
/// Hugging Face Hub.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// identifier (:obj:`str`):
|
||||||
|
/// The identifier of a Model on the Hugging Face Hub, that contains
|
||||||
|
/// a tokenizer.json file
|
||||||
|
/// revision (:obj:`str`, defaults to `main`):
|
||||||
|
/// A branch or commit id
|
||||||
|
/// auth_token (:obj:`str`, `optional`, defaults to `None`):
|
||||||
|
/// An optional auth token used to access private repositories on the
|
||||||
|
/// Hugging Face Hub
|
||||||
|
///
|
||||||
|
/// Returns:
|
||||||
|
/// :class:`~tokenizers.Tokenizer`: The new tokenizer
|
||||||
|
#[staticmethod]
|
||||||
|
#[args(revision = "String::from(\"main\")", auth_token = "None")]
|
||||||
|
#[text_signature = "(identifier, revision=\"main\", auth_token=None)"]
|
||||||
|
fn from_pretrained(
|
||||||
|
identifier: &str,
|
||||||
|
revision: String,
|
||||||
|
auth_token: Option<String>,
|
||||||
|
) -> PyResult<Self> {
|
||||||
|
let params = tk::utils::from_pretrained::FromPretrainedParameters {
|
||||||
|
revision,
|
||||||
|
auth_token,
|
||||||
|
user_agent: [("bindings", "Python"), ("version", crate::VERSION)]
|
||||||
|
.iter()
|
||||||
|
.map(|(k, v)| (k.to_string(), v.to_string()))
|
||||||
|
.collect(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let tokenizer: PyResult<_> =
|
||||||
|
ToPyResult(Tokenizer::from_pretrained(identifier, Some(params))).into();
|
||||||
|
Ok(Self::new(tokenizer?))
|
||||||
|
}
|
||||||
|
|
||||||
/// Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
|
/// Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
|
||||||
///
|
///
|
||||||
/// Args:
|
/// Args:
|
||||||
|
Reference in New Issue
Block a user