Mirror of https://github.com/mii443/tokenizers.git, last synced 2025-08-31 04:29:21 +00:00.
Python - Add bindings to Tokenizer.from_pretrained
This commit is contained in:
964
bindings/python/Cargo.lock
generated
964
bindings/python/Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -858,6 +858,26 @@ class Tokenizer:
|
||||
"""
|
||||
pass
|
||||
@staticmethod
|
||||
def from_pretrained(identifier, revision="main", auth_token=None):
|
||||
"""
|
||||
Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
|
||||
Hugging Face Hub.
|
||||
|
||||
Args:
|
||||
identifier (:obj:`str`):
|
||||
The identifier of a Model on the Hugging Face Hub, that contains
|
||||
a tokenizer.json file
|
||||
revision (:obj:`str`, defaults to `main`):
|
||||
A branch or commit id
|
||||
auth_token (:obj:`str`, `optional`, defaults to `None`):
|
||||
An optional auth token used to access private repositories on the
|
||||
Hugging Face Hub
|
||||
|
||||
Returns:
|
||||
:class:`~tokenizers.Tokenizer`: The new tokenizer
|
||||
"""
|
||||
pass
|
||||
@staticmethod
|
||||
def from_str(json):
|
||||
"""
|
||||
Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
|
||||
|
@ -18,6 +18,8 @@ mod utils;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::wrap_pymodule;
|
||||
|
||||
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
|
||||
// For users using multiprocessing in python, it is quite easy to fork the process running
|
||||
// tokenizers, ending up with a deadlock because we internaly make use of multithreading. So
|
||||
// we register a callback to be called in the event of a fork so that we can warn the user.
|
||||
|
@ -544,6 +544,43 @@ impl PyTokenizer {
|
||||
Ok(Self { tokenizer })
|
||||
}
|
||||
|
||||
/// Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
|
||||
/// Hugging Face Hub.
|
||||
///
|
||||
/// Args:
|
||||
/// identifier (:obj:`str`):
|
||||
/// The identifier of a Model on the Hugging Face Hub, that contains
|
||||
/// a tokenizer.json file
|
||||
/// revision (:obj:`str`, defaults to `main`):
|
||||
/// A branch or commit id
|
||||
/// auth_token (:obj:`str`, `optional`, defaults to `None`):
|
||||
/// An optional auth token used to access private repositories on the
|
||||
/// Hugging Face Hub
|
||||
///
|
||||
/// Returns:
|
||||
/// :class:`~tokenizers.Tokenizer`: The new tokenizer
|
||||
#[staticmethod]
|
||||
#[args(revision = "String::from(\"main\")", auth_token = "None")]
|
||||
#[text_signature = "(identifier, revision=\"main\", auth_token=None)"]
|
||||
fn from_pretrained(
|
||||
identifier: &str,
|
||||
revision: String,
|
||||
auth_token: Option<String>,
|
||||
) -> PyResult<Self> {
|
||||
let params = tk::utils::from_pretrained::FromPretrainedParameters {
|
||||
revision,
|
||||
auth_token,
|
||||
user_agent: [("bindings", "Python"), ("version", crate::VERSION)]
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_string(), v.to_string()))
|
||||
.collect(),
|
||||
};
|
||||
|
||||
let tokenizer: PyResult<_> =
|
||||
ToPyResult(Tokenizer::from_pretrained(identifier, Some(params))).into();
|
||||
Ok(Self::new(tokenizer?))
|
||||
}
|
||||
|
||||
/// Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
|
||||
///
|
||||
/// Args:
|
||||
|
Reference in New Issue
Block a user