Python - Add bindings to Tokenizer.from_pretrained

This commit is contained in:
Anthony Moi
2021-08-19 11:45:23 +02:00
committed by Anthony MOI
parent e71e5be64f
commit e44fdee4a1
4 changed files with 1022 additions and 1 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -858,6 +858,26 @@ class Tokenizer:
"""
pass
@staticmethod
def from_pretrained(identifier, revision="main", auth_token=None):
    """
    Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
    Hugging Face Hub.

    Args:
        identifier (:obj:`str`):
            The identifier of a Model on the Hugging Face Hub, that contains
            a tokenizer.json file
        revision (:obj:`str`, defaults to :obj:`"main"`):
            A branch or commit id
        auth_token (:obj:`str`, `optional`, defaults to :obj:`None`):
            An optional auth token used to access private repositories on the
            Hugging Face Hub

    Returns:
        :class:`~tokenizers.Tokenizer`: The new tokenizer
    """
    # Stub only: the real implementation lives in the native (Rust) extension.
    pass
@staticmethod
def from_str(json):
"""
Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.

View File

@@ -18,6 +18,8 @@ mod utils;
use pyo3::prelude::*;
use pyo3::wrap_pymodule;
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
// For users using multiprocessing in python, it is quite easy to fork the process running
// tokenizers, ending up with a deadlock because we internally make use of multithreading. So
// we register a callback to be called in the event of a fork so that we can warn the user.

View File

@@ -544,6 +544,43 @@ impl PyTokenizer {
Ok(Self { tokenizer })
}
/// Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
/// Hugging Face Hub.
///
/// Args:
///     identifier (:obj:`str`):
///         The identifier of a Model on the Hugging Face Hub, that contains
///         a tokenizer.json file
///     revision (:obj:`str`, defaults to `main`):
///         A branch or commit id
///     auth_token (:obj:`str`, `optional`, defaults to `None`):
///         An optional auth token used to access private repositories on the
///         Hugging Face Hub
///
/// Returns:
///     :class:`~tokenizers.Tokenizer`: The new tokenizer
#[staticmethod]
#[args(revision = "String::from(\"main\")", auth_token = "None")]
#[text_signature = "(identifier, revision=\"main\", auth_token=None)"]
fn from_pretrained(
    identifier: &str,
    revision: String,
    auth_token: Option<String>,
) -> PyResult<Self> {
    // Identify these bindings in the download request so the Hub can tell
    // Python-binding traffic apart from other clients.
    let user_agent = [("bindings", "Python"), ("version", crate::VERSION)]
        .iter()
        .map(|&(key, value)| (key.to_string(), value.to_string()))
        .collect();
    let params = tk::utils::from_pretrained::FromPretrainedParameters {
        revision,
        auth_token,
        user_agent,
    };
    // Download/resolve the tokenizer file, converting any core error into a
    // Python exception via ToPyResult, then wrap it in the pyo3 class.
    let downloaded = Tokenizer::from_pretrained(identifier, Some(params));
    let tokenizer: PyResult<_> = ToPyResult(downloaded).into();
    Ok(Self::new(tokenizer?))
}
/// Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
///
/// Args: