#![warn(clippy::all)] extern crate tokenizers as tk; mod decoders; mod encoding; mod error; mod models; mod normalizers; mod pre_tokenizers; mod processors; mod token; mod tokenizer; mod trainers; use pyo3::prelude::*; use pyo3::wrap_pymodule; // For users using multiprocessing in python, it is quite easy to fork the process running // tokenizers, ending up with a deadlock because we internaly make use of multithreading. So // we register a callback to be called in the event of a fork so that we can warn the user. static mut REGISTERED_FORK_CALLBACK: bool = false; extern "C" fn child_after_fork() { use tk::parallelism::*; if has_parallelism_been_used() && !is_parallelism_configured() { println!( "huggingface/tokenizers: The current process just got forked, after parallelism has \ already been used. Disabling parallelism to avoid deadlocks..." ); println!("To disable this warning, you can either:"); println!( "\t- Avoid using `tokenizers` before the fork if possible\n\ \t- Explicitly set the environment variable {}=(true | false)", ENV_VARIABLE ); set_parallelism(false); } } /// Trainers Module #[pymodule] fn trainers(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; Ok(()) } /// Models Module #[pymodule] fn models(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; Ok(()) } /// PreTokenizers Module #[pymodule] fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; Ok(()) } /// Decoders Module #[pymodule] fn decoders(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; Ok(()) } /// Processors Module #[pymodule] fn processors(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; Ok(()) } /// Normalizers Module #[pymodule] fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; Ok(()) } /// Tokenizers Module #[pymodule] fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> { env_logger::init_from_env("TOKENIZERS_LOG"); // Register the fork callback #[cfg(target_family = "unix")] unsafe { if !REGISTERED_FORK_CALLBACK { libc::pthread_atfork(None, None, Some(child_after_fork)); REGISTERED_FORK_CALLBACK = true; } } m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_wrapped(wrap_pymodule!(models))?; m.add_wrapped(wrap_pymodule!(pre_tokenizers))?; m.add_wrapped(wrap_pymodule!(decoders))?; m.add_wrapped(wrap_pymodule!(processors))?; m.add_wrapped(wrap_pymodule!(normalizers))?; m.add_wrapped(wrap_pymodule!(trainers))?; Ok(()) }