mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00

Automatically stubbing the pyi files while keeping inspecting ability (#509)

* First pass on automatic stubbing of our Python files.
* And now modifying all Rust docs to be visible in the .pyi files.
* Better assert-fail message.
* Fixing the GitHub workflow.
* Removing types not exported anymore.
* Fixing the `Tokenizer` signature.
* Disabling auto __init__.py.
* Re-enabling some types.
* Don't overwrite non-automated __init__.py.
* Automated most __init__.py.
* Re-stubbing after rebase.
* Fixing the env for tests.
* Install black in the env.
* Use the PY35 target in stub.py.

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
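For illustration, this is roughly the shape of stub entry such a generator would emit for the Normalizer class touched in the diff below. This is a sketch reconstructed from the docstrings and signatures visible in this diff; the exact output of stub.py may differ.

# Illustrative .pyi stub sketch, derived from the Rust docs in this diff
class Normalizer:
    """Base class for all normalizers

    This class is not supposed to be instantiated directly. Instead, any
    implementation of a Normalizer will return an instance of this class
    when instantiated.
    """
    def normalize(self, normalized):
        """Normalize the given NormalizedString in-place"""
        pass
    def normalize_str(self, sequence):
        """Normalize the given str"""
        pass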
@@ -15,6 +15,10 @@ use tk::normalizers::{
use tk::{NormalizedString, Normalizer};
use tokenizers as tk;

/// Base class for all normalizers
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
/// Normalizer will return an instance of this class when instantiated.
#[pyclass(dict, module = "tokenizers.normalizers", name=Normalizer)]
#[derive(Clone, Serialize, Deserialize)]
pub struct PyNormalizer {
@@ -105,10 +109,14 @@ impl PyNormalizer {
    }
}

    /// Normalize the given NormalizedString in-place
    #[text_signature = "(self, normalized)"]
    fn normalize(&self, normalized: &mut PyNormalizedString) -> PyResult<()> {
        ToPyResult(self.normalizer.normalize(&mut normalized.normalized)).into()
    }

    /// Normalize the given str
    #[text_signature = "(self, sequence)"]
    fn normalize_str(&self, sequence: &str) -> PyResult<String> {
        let mut normalized = NormalizedString::from(sequence);
        ToPyResult(self.normalizer.normalize(&mut normalized)).into_py()?;
@@ -116,7 +124,30 @@ impl PyNormalizer {
    }
}
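As a reading aid for the two methods above, here is a minimal sketch of how they surface on the Python side, using the Lowercase normalizer defined later in this file (the base class is not instantiated directly):

from tokenizers.normalizers import Lowercase

norm = Lowercase()
# normalize_str wraps the str in a NormalizedString, runs normalize on it,
# and returns the resulting str
print(norm.normalize_str("Hello WORLD"))  # -> "hello world"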

/// BertNormalizer
///
/// Takes care of normalizing raw text before giving it to a Bert model.
/// This includes cleaning the text, handling accents, Chinese chars, and lowercasing.
///
/// Args:
///     clean_text: (`optional`) boolean:
///         Whether to clean the text by removing any control characters
///         and replacing all whitespace characters with the classic one.
///
///     handle_chinese_chars: (`optional`) boolean:
///         Whether to handle Chinese chars by putting spaces around them.
///
///     strip_accents: (`optional`) boolean:
///         Whether to strip all accents. If this option is not specified (i.e. == None),
///         then it will be determined by the value of `lowercase` (as in the original Bert).
///
///     lowercase: (`optional`) boolean:
///         Whether to lowercase.
///
/// Returns:
///     Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=BertNormalizer)]
#[text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"]
pub struct PyBertNormalizer {}
#[pymethods]
impl PyBertNormalizer {
@@ -146,7 +177,9 @@ impl PyBertNormalizer {
    }
}
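A short usage sketch of the arguments documented above. The output reflects what the documented defaults imply: with strip_accents=None and lowercase=True, accents get stripped.

from tokenizers.normalizers import BertNormalizer

norm = BertNormalizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=None,  # None: falls back to the value of `lowercase`
    lowercase=True,
)
print(norm.normalize_str("Héllò hôw are ü?"))  # -> "hello how are u?"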

/// NFD Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFD)]
#[text_signature = "(self)"]
pub struct PyNFD {}
#[pymethods]
impl PyNFD {
@@ -156,7 +189,9 @@ impl PyNFD {
    }
}

/// NFKD Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKD)]
#[text_signature = "(self)"]
pub struct PyNFKD {}
#[pymethods]
impl PyNFKD {
@@ -166,7 +201,9 @@ impl PyNFKD {
    }
}

/// NFC Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFC)]
#[text_signature = "(self)"]
pub struct PyNFC {}
#[pymethods]
impl PyNFC {
@@ -176,7 +213,9 @@ impl PyNFC {
    }
}

/// NFKC Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKC)]
#[text_signature = "(self)"]
pub struct PyNFKC {}
#[pymethods]
impl PyNFKC {
@@ -186,6 +225,12 @@ impl PyNFKC {
    }
}
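The four normalizers above map directly onto the standard Unicode normalization forms; a quick round-trip sketch:

from tokenizers.normalizers import NFC, NFD

precomposed = "\u00e9"                         # "é" as a single code point
decomposed = NFD().normalize_str(precomposed)  # "e" + combining acute accent
assert len(decomposed) == 2
assert NFC().normalize_str(decomposed) == precomposed  # NFC recomposes it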

/// Allows concatenating multiple other Normalizers as a Sequence.
/// All the normalizers run in sequence, in the given order.
///
/// Args:
///     normalizers: List[Normalizer]:
///         A list of Normalizers to be run as a sequence
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Sequence)]
pub struct PySequence {}
#[pymethods]
@@ -211,7 +256,9 @@ impl PySequence {
    }
}
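A typical use of Sequence, chaining normalizers so that NFD exposes combining marks for StripAccents before lowercasing:

from tokenizers.normalizers import NFD, StripAccents, Lowercase, Sequence

norm = Sequence([NFD(), StripAccents(), Lowercase()])
print(norm.normalize_str("Héllò"))  # -> "hello"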

/// Lowercase Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Lowercase)]
#[text_signature = "(self)"]
pub struct PyLowercase {}
#[pymethods]
impl PyLowercase {
@@ -221,7 +268,9 @@ impl PyLowercase {
    }
}

/// Strip normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Strip)]
#[text_signature = "(self, left=True, right=True)"]
pub struct PyStrip {}
#[pymethods]
impl PyStrip {
@@ -245,6 +294,7 @@ impl PyStrip {
    }
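Per the text_signature above, the two flags choose which side of the string to strip; a quick sketch:

from tokenizers.normalizers import Strip

print(repr(Strip(left=False, right=True).normalize_str("  hi  ")))  # '  hi'
print(repr(Strip().normalize_str("  hi  ")))                        # 'hi' (both sides by default)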

#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
#[text_signature = "(self)"]
pub struct PyStripAccents {}
#[pymethods]
impl PyStripAccents {
@@ -389,7 +439,9 @@ impl Normalizer for PyNormalizerWrapper {
    }
}

/// Nmt normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Nmt)]
#[text_signature = "(self)"]
pub struct PyNmt {}
#[pymethods]
impl PyNmt {
@@ -399,7 +451,10 @@ impl PyNmt {
    }
}

/// Precompiled normalizer
/// Don't use manually; it is used for compatibility with SentencePiece.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Precompiled)]
#[text_signature = "(self, precompiled_charsmap)"]
pub struct PyPrecompiled {}
#[pymethods]
impl PyPrecompiled {
@@ -420,7 +475,9 @@ impl PyPrecompiled {
    }
}

/// Replace normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)]
#[text_signature = "(self, pattern, content)"]
pub struct PyReplace {}
#[pymethods]
impl PyReplace {
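The diff cuts off here, but the text_signature above is enough for a usage sketch: Replace substitutes every occurrence of `pattern` (a plain string matches literally) with `content`:

from tokenizers.normalizers import Replace

norm = Replace("``", '"')
print(norm.normalize_str("``quoted``"))  # -> '"quoted"'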