Temp work to make the APIs uniform (build from memory by default).

Nicolas Patry
2020-09-22 09:41:07 +02:00
parent b24a2fc178
commit 98a30eead1
16 changed files with 438 additions and 162 deletions
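The gist of the change, judging from the Python test updates below, is that models are now built from in-memory vocabularies by default, while file-based loading moves to explicit constructors. A minimal sketch of the intended usage (vocab and merges values are illustrative, taken from the updated tests):

    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    # In-memory vocab, and merges mapping a pair of ids to (importance, merged id):
    # here (a, b) -> "ab".
    vocab = {"a": 0, "b": 1, "ab": 2}
    merges = {(0, 1): (0, 2)}

    tokenizer = Tokenizer(BPE(vocab, merges))

    # Loading from files is still possible through an explicit constructor
    # (file paths here are illustrative):
    # bpe = BPE.from_files("vocab.json", "merges.txt")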


@@ -2,10 +2,12 @@ const native = require("./native");
module.exports = {
BPE: {
init: native.models_BPE_init,
fromFiles: native.models_BPE_from_files,
empty: native.models_BPE_empty,
},
WordPiece: {
init: native.models_WordPiece_init,
fromFiles: native.models_WordPiece_from_files,
empty: native.models_WordPiece_empty,
},


@@ -128,17 +128,14 @@ impl BpeOptions {
}
}
/// bpe_from_files(vocab: String, merges: String, options: {
/// bpe_init(vocab: Map<String, u32>, merges: Map<(u32, u32), (u32, u32)>, options: {
/// cacheCapacity?: number,
/// dropout?: number,
/// unkToken?: String,
/// continuingSubwordPrefix?: String,
/// endOfWordSuffix?: String
/// }, callback)
pub fn bpe_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
let vocab = cx.extract::<String>(0)?;
let merges = cx.extract::<String>(1)?;
pub fn bpe_init(mut cx: FunctionContext) -> JsResult<JsUndefined> {
let (options, callback) = match cx.extract_opt::<BpeOptions>(2) {
// Options were there, and extracted
Ok(Some(options)) => (options, cx.argument::<JsFunction>(3)?),
@@ -147,8 +144,38 @@ pub fn bpe_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
// Options not specified, callback instead
Err(_) => (BpeOptions::default(), cx.argument::<JsFunction>(2)?),
};
let vocab = cx.extract::<HashMap<String, u32>>(0)?;
let merges = cx.extract::<HashMap<(u32, u32), (u32, u32)>>(1)?;
let mut builder = tk::models::bpe::BPE::builder().vocab_and_merges(vocab, merges);
builder = options.apply_to_bpe_builder(builder);
let task = BPEFromFilesTask::new(builder);
task.schedule(callback);
Ok(cx.undefined())
}
/// bpe_from_files(vocab: String, merges: String, options: {
/// cacheCapacity?: number,
/// dropout?: number,
/// unkToken?: String,
/// continuingSubwordPrefix?: String,
/// endOfWordSuffix?: String
/// }, callback)
pub fn bpe_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
let (options, callback) = match cx.extract_opt::<BpeOptions>(2) {
// Options were there, and extracted
Ok(Some(options)) => (options, cx.argument::<JsFunction>(3)?),
// Options were undefined or null
Ok(None) => (BpeOptions::default(), cx.argument::<JsFunction>(3)?),
// Options not specified, callback instead
Err(_) => (BpeOptions::default(), cx.argument::<JsFunction>(2)?),
};
let vocab = cx.extract::<String>(0)?;
let merges = cx.extract::<String>(1)?;
let mut builder = tk::models::bpe::BPE::from_files(&vocab, &merges);
builder = options.apply_to_bpe_builder(builder);
let task = BPEFromFilesTask::new(builder);
@@ -190,14 +217,12 @@ impl WordPieceOptions {
}
}
/// wordpiece_from_files(vocab: String, options: {
/// wordpiece_init(vocab: Map<String, u32>, options: {
/// unkToken?: String = "[UNK]",
/// maxInputCharsPerWord?: number = 100,
/// continuingSubwordPrefix?: "##",
/// }, callback)
pub fn wordpiece_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
let vocab = cx.extract::<String>(0)?;
pub fn wordpiece_init(mut cx: FunctionContext) -> JsResult<JsUndefined> {
let (options, callback) = match cx.extract_opt::<WordPieceOptions>(1) {
// Options were there, and extracted
Ok(Some(options)) => (options, cx.argument::<JsFunction>(2)?),
@@ -207,11 +232,36 @@ pub fn wordpiece_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
Err(_) => (WordPieceOptions::default(), cx.argument::<JsFunction>(1)?),
};
let mut builder = tk::models::wordpiece::WordPiece::from_files(&vocab);
builder = options.apply_to_wordpiece_builder(builder);
let vocab = cx.extract::<HashMap<String, u32>>(0)?;
let mut builder = tk::models::wordpiece::WordPiece::builder().vocab(vocab);
builder = options.apply_to_wordpiece_builder(builder);
let task = WordPieceFromFilesTask::new(builder);
task.schedule(callback);
Ok(cx.undefined())
}
/// wordpiece_from_files(vocab: String, options: {
/// unkToken?: String = "[UNK]",
/// maxInputCharsPerWord?: number = 100,
/// continuingSubwordPrefix?: "##",
/// }, callback)
pub fn wordpiece_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
let (options, callback) = match cx.extract_opt::<WordPieceOptions>(1) {
// Options were there, and extracted
Ok(Some(options)) => (options, cx.argument::<JsFunction>(2)?),
// Options were undefined or null
Ok(None) => (WordPieceOptions::default(), cx.argument::<JsFunction>(2)?),
// Options not specified, callback instead
Err(_) => (WordPieceOptions::default(), cx.argument::<JsFunction>(1)?),
};
let vocab = cx.extract::<String>(0)?;
let mut builder = tk::models::wordpiece::WordPiece::from_files(&vocab);
builder = options.apply_to_wordpiece_builder(builder);
let task = WordPieceFromFilesTask::new(builder);
task.schedule(callback);
Ok(cx.undefined())
}
@@ -228,8 +278,10 @@ pub fn wordpiece_empty(mut cx: FunctionContext) -> JsResult<JsModel> {
/// Register everything here
pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
m.export_function(&format!("{}_BPE_init", prefix), bpe_init)?;
m.export_function(&format!("{}_BPE_from_files", prefix), bpe_from_files)?;
m.export_function(&format!("{}_BPE_empty", prefix), bpe_empty)?;
m.export_function(&format!("{}_WordPiece_init", prefix), wordpiece_init)?;
m.export_function(
&format!("{}_WordPiece_from_files", prefix),
wordpiece_from_files,


@@ -5,7 +5,7 @@ from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
from typing import Optional, List, Union, Dict
class BertWordPieceTokenizer(BaseTokenizer):
@@ -13,7 +13,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
def __init__(
self,
vocab_file: Optional[str] = None,
vocab: Optional[Union[str, Dict[str, int]]] = None,
unk_token: Union[str, AddedToken] = "[UNK]",
sep_token: Union[str, AddedToken] = "[SEP]",
cls_token: Union[str, AddedToken] = "[CLS]",
@@ -26,8 +26,8 @@ class BertWordPieceTokenizer(BaseTokenizer):
wordpieces_prefix: str = "##",
):
if vocab_file is not None:
tokenizer = Tokenizer(WordPiece(vocab_file, unk_token=str(unk_token)))
if vocab is not None:
tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
else:
tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))
@@ -51,7 +51,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
)
tokenizer.pre_tokenizer = BertPreTokenizer()
if vocab_file is not None:
if vocab is not None:
sep_token_id = tokenizer.token_to_id(str(sep_token))
if sep_token_id is None:
raise TypeError("sep_token not found in the vocabulary")
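With this change, BertWordPieceTokenizer accepts an in-memory vocabulary as well as a path. A rough sketch (the vocabulary below is illustrative; it must contain the special tokens, since their ids are looked up right after construction):

    from tokenizers import BertWordPieceTokenizer

    vocab = {"[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "[MASK]": 4, "my": 5, "name": 6}
    tokenizer = BertWordPieceTokenizer(vocab=vocab)

    # The file-based route used by the updated tests (illustrative path):
    # tokenizer = BertWordPieceTokenizer.from_file("vocab.txt")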


@@ -1,21 +1,28 @@
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers, processors
from tokenizers import (
Tokenizer,
AddedToken,
pre_tokenizers,
decoders,
trainers,
processors,
)
from tokenizers.models import BPE
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
from typing import Optional, List, Union, Dict, Tuple
class ByteLevelBPETokenizer(BaseTokenizer):
""" ByteLevelBPETokenizer
"""ByteLevelBPETokenizer
Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
"""
def __init__(
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
vocab: Optional[Union[str, Dict[str, int]]] = None,
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
add_prefix_space: bool = False,
lowercase: bool = False,
dropout: Optional[float] = None,
@@ -24,11 +31,11 @@ class ByteLevelBPETokenizer(BaseTokenizer):
end_of_word_suffix: Optional[str] = None,
trim_offsets: bool = False,
):
if vocab_file is not None and merges_file is not None:
if vocab is not None and merges is not None:
tokenizer = Tokenizer(
BPE(
vocab_file,
merges_file,
vocab,
merges,
dropout=dropout,
continuing_subword_prefix=continuing_subword_prefix or "",
end_of_word_suffix=end_of_word_suffix or "",


@@ -1,13 +1,18 @@
from .. import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from ..models import BPE
from ..normalizers import Sequence, Lowercase, unicode_normalizer_from_str, BertNormalizer
from ..normalizers import (
Sequence,
Lowercase,
unicode_normalizer_from_str,
BertNormalizer,
)
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
from typing import Optional, List, Union, Dict, Tuple
class CharBPETokenizer(BaseTokenizer):
""" Original BPE Tokenizer
"""Original BPE Tokenizer
Represents the BPE algorithm, as introduced by Rico Sennrich
(https://arxiv.org/abs/1508.07909)
@@ -24,8 +29,8 @@ class CharBPETokenizer(BaseTokenizer):
def __init__(
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
vocab: Optional[Union[str, Dict[str, int]]] = None,
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
unk_token: Union[str, AddedToken] = "<unk>",
suffix: str = "</w>",
dropout: Optional[float] = None,
@@ -34,11 +39,11 @@ class CharBPETokenizer(BaseTokenizer):
bert_normalizer: bool = True,
split_on_whitespace_only: bool = False,
):
if vocab_file is not None and merges_file is not None:
if vocab is not None and merges is not None:
tokenizer = Tokenizer(
BPE(
vocab_file,
merges_file,
vocab,
merges,
dropout=dropout,
unk_token=str(unk_token),
end_of_word_suffix=suffix,


@@ -3,28 +3,26 @@ from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
from typing import Optional, List, Union, Dict, Tuple
class SentencePieceBPETokenizer(BaseTokenizer):
""" SentencePiece BPE Tokenizer
"""SentencePiece BPE Tokenizer
Represents the BPE algorithm, with the pretokenization used by SentencePiece
"""
def __init__(
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
vocab: Optional[Union[str, Dict[str, int]]] = None,
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
unk_token: Union[str, AddedToken] = "<unk>",
replacement: str = "▁",
add_prefix_space: bool = True,
dropout: Optional[float] = None,
):
if vocab_file is not None and merges_file is not None:
tokenizer = Tokenizer(
BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token)
)
if vocab is not None and merges is not None:
tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token))
else:
tokenizer = Tokenizer(BPE())


@@ -92,19 +92,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
)
data = {"unk_id": unk_id, "vocab": vocab}
replacement = "▁"
add_prefix_space = True
out_vocab_filename = f"{filename}.json"
try:
with open(out_vocab_filename, "w") as f:
json.dump(data, f, indent=4)
tokenizer = Tokenizer(Unigram(out_vocab_filename))
finally:
os.remove(out_vocab_filename)
tokenizer = Tokenizer(Unigram(vocab, unk_id))
tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(


@@ -1,5 +1,5 @@
from .. import Encoding, Offsets, Token
from typing import List, Optional, Union, Tuple
from typing import List, Optional, Union, Tuple, Dict
class Model:
""" Base class for all models
@@ -32,11 +32,15 @@ class BPE(Model):
Instantiate a BPE Model from the given vocab and merges.
Args:
vocab: ('`optional`) string:
Path to a vocabulary JSON file.
vocab: (`optional`) Dict[str, int]:
A dictionary of string keys and their ids {"am": 0,...}
merges: (`optional`) string:
Path to a merge file.
A dictionary with pairs of ids as keys and their merge correspondence as values:
{(id_left, id_right): (importance, id_merged), ...}
With vocab: {"a": 0, "b": 1, ..., "ab": 4}, the merge
{(0, 1): (0, 4), ...}
corresponds to the "ab" merge, the most likely one since its importance is 0
cache_capacity: (`optional`) int:
The number of words that the BPE cache can contain. The cache allows
@@ -62,8 +66,8 @@ class BPE(Model):
@staticmethod
def __init__(
self,
vocab: Optional[str],
merges: Optional[str],
vocab: Optional[Union[str, Dict[str, int]]],
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]],
cache_capacity: Optional[int],
dropout: Optional[float],
unk_token: Optional[str],
@@ -80,7 +84,7 @@ class WordPiece(Model):
Args:
vocab: (`optional`) string:
Path to a vocabulary file.
A dictionary of string keys and their ids {"am": 0,...}
unk_token: (`optional`) str:
The unknown token to be used by the model.
@@ -91,7 +95,7 @@ class WordPiece(Model):
def __init__(
self,
vocab: Optional[str],
vocab: Optional[Union[str, Dict[str, int]]],
unk_token: Optional[str],
max_input_chars_per_word: Optional[int],
):
@@ -105,13 +109,13 @@ class WordLevel(Model):
Args:
vocab: (`optional`) string:
Path to a vocabulary file.
A dictionary of string keys and their ids {"am": 0,...}
unk_token: str:
The unknown token to be used by the model.
"""
def __init__(self, vocab: Optional[str], unk_token: Optional[str]):
def __init__(self, vocab: Optional[Union[str, Dict[str, int]]], unk_token: Optional[str]):
pass
class Unigram(Model):
@@ -121,10 +125,10 @@ class Unigram(Model):
Args:
vocab: ('`optional`) string:
Path to a vocabulary JSON file.
A list of vocabulary items and their relative scores [("am", -0.2442),...]
"""
@staticmethod
def __init__(self, vocab: Optional[str]):
def __init__(self, vocab: Optional[List[Tuple[str, float]]]):
pass


@@ -7,16 +7,25 @@ use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use serde::{Deserialize, Serialize};
use tk::models::bpe::BPE;
use tk::models::bpe::{BpeBuilder, BPE};
use tk::models::unigram::Unigram;
use tk::models::wordlevel::WordLevel;
use tk::models::wordpiece::WordPiece;
use tk::models::wordpiece::{WordPiece, WordPieceBuilder};
use tk::models::ModelWrapper;
use tk::{Model, Token};
use tokenizers as tk;
use super::error::ToPyResult;
fn deprecation_warning(version: &str, message: &str) -> PyResult<()> {
let gil = pyo3::Python::acquire_gil();
let python = gil.python();
let deprecation_warning = python.import("builtins")?.get("DeprecationWarning")?;
let full_message = format!("Deprecated in {}: {}", version, message);
pyo3::PyErr::warn(python, deprecation_warning, &full_message, 0)?;
Ok(())
}
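On the Python side this helper surfaces as a regular DeprecationWarning, so callers can assert on it or silence it with the standard warnings module. A sketch (assuming legacy vocab/merges files exist at the given, illustrative, paths):

    import warnings
    from tokenizers.models import BPE

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        BPE("vocab.json", "merges.txt")  # legacy path-based call
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)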
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass(module = "tokenizers.models", name=Model)]
@@ -137,25 +146,8 @@ impl PyModel {
#[pyclass(extends=PyModel, module = "tokenizers.models", name=BPE)]
pub struct PyBPE {}
#[pymethods]
impl PyBPE {
#[new]
#[args(kwargs = "**")]
fn new(
vocab: Option<&str>,
merges: Option<&str>,
kwargs: Option<&PyDict>,
) -> PyResult<(Self, PyModel)> {
if (vocab.is_some() && merges.is_none()) || (vocab.is_none() && merges.is_some()) {
return Err(exceptions::PyValueError::new_err(
"`vocab` and `merges` must be both specified",
));
}
let mut builder = BPE::builder();
if let (Some(vocab), Some(merges)) = (vocab, merges) {
builder = builder.files(vocab.to_owned(), merges.to_owned());
}
fn with_builder(mut builder: BpeBuilder, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
let key: &str = key.extract()?;
@@ -191,21 +183,62 @@ impl PyBPE {
}
}
#[pymethods]
impl PyBPE {
#[new]
#[args(kwargs = "**")]
fn new(
vocab: Option<&PyAny>,
merges: Option<&PyAny>,
kwargs: Option<&PyDict>,
) -> PyResult<(Self, PyModel)> {
if (vocab.is_some() && merges.is_none()) || (vocab.is_none() && merges.is_some()) {
return Err(exceptions::PyValueError::new_err(
"`vocab` and `merges` must be both specified",
));
}
let mut builder = BPE::builder();
if let (Some(vocab_any), Some(merges_any)) = (vocab, merges) {
if let (Ok(vocab), Ok(merges)) = (vocab_any.extract(), merges_any.extract()) {
builder = builder.vocab_and_merges(vocab, merges);
} else {
let vocab_filename: String = vocab_any.extract()?;
let merges_filename: String = merges_any.extract()?;
deprecation_warning(
"0.9.0",
"BPE.__init__ will not create from files anymore, try `BPE.from_files` instead",
)?;
builder = builder.files(vocab_filename, merges_filename);
}
}
PyBPE::with_builder(builder, kwargs)
}
#[staticmethod]
#[args(kwargs = "**")]
fn from_files(
vocab_filename: String,
merges_filename: String,
kwargs: Option<&PyDict>,
) -> PyResult<(Self, PyModel)> {
let mut builder = BPE::builder();
builder = builder.files(vocab_filename, merges_filename);
PyBPE::with_builder(builder, kwargs)
}
}
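In other words, there are now three ways to reach a BPE model from Python, only one of which warns (values and paths below are illustrative):

    from tokenizers.models import BPE

    vocab = {"a": 0, "b": 1, "ab": 2}
    merges = {(0, 1): (0, 2)}

    BPE(vocab, merges)                          # in-memory, no warning
    BPE.from_files("vocab.json", "merges.txt")  # explicit file loading, no warning
    BPE("vocab.json", "merges.txt")             # still accepted, emits a DeprecationWarning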
/// WordPiece Model
#[pyclass(extends=PyModel, module = "tokenizers.models", name=WordPiece)]
pub struct PyWordPiece {}
#[pymethods]
impl PyWordPiece {
#[new]
#[args(kwargs = "**")]
fn new(vocab: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
let mut builder = WordPiece::builder();
if let Some(vocab) = vocab {
builder = builder.files(vocab.to_owned());
}
fn with_builder(
mut builder: WordPieceBuilder,
kwargs: Option<&PyDict>,
) -> PyResult<(Self, PyModel)> {
if let Some(kwargs) = kwargs {
for (key, val) in kwargs {
let key: &str = key.extract()?;
@@ -234,14 +267,43 @@ impl PyWordPiece {
}
}
#[pymethods]
impl PyWordPiece {
#[new]
#[args(kwargs = "**")]
fn new(vocab: Option<&PyAny>, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
let mut builder = WordPiece::builder();
if let Some(vocab_any) = vocab {
#[allow(deprecated)]
if let Ok(vocab) = vocab_any.extract() {
builder = builder.vocab(vocab);
} else {
deprecation_warning(
"0.9.0",
"WordPiece.__init__ will not create from files anymore, try `WordPiece.from_file` instead",
)?;
let vocab_filename: String = vocab_any.extract()?;
builder = builder.files(vocab_filename);
}
}
PyWordPiece::with_builder(builder, kwargs)
}
#[staticmethod]
fn from_file(vocab: String, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
let mut builder = WordPiece::builder();
builder = builder.files(vocab);
PyWordPiece::with_builder(builder, kwargs)
}
}
#[pyclass(extends=PyModel, module = "tokenizers.models", name=WordLevel)]
pub struct PyWordLevel {}
#[pymethods]
impl PyWordLevel {
#[new]
#[args(kwargs = "**")]
fn new(vocab: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
fn get_unk(kwargs: Option<&PyDict>) -> PyResult<String> {
let mut unk_token = String::from("<unk>");
if let Some(kwargs) = kwargs {
@@ -253,15 +315,38 @@ impl PyWordLevel {
}
}
}
if let Some(vocab) = vocab {
match WordLevel::from_files(vocab, unk_token) {
Err(e) => Err(exceptions::PyException::new_err(format!(
"Error while initializing WordLevel: {}",
e
))),
Ok(model) => Ok((PyWordLevel {}, PyModel::new(Arc::new(model.into())))),
Ok(unk_token)
}
}
#[pymethods]
impl PyWordLevel {
#[new]
#[args(kwargs = "**")]
fn new(vocab: Option<&PyAny>, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
let unk_token = PyWordLevel::get_unk(kwargs)?;
if let Some(vocab_object) = vocab {
let model = if let Ok(vocab) = vocab_object.extract() {
WordLevel::builder()
.vocab(vocab)
.unk_token(unk_token)
.build()
} else {
let filename: &str = vocab_object.extract()?;
deprecation_warning(
"0.9.0",
"WordLevel.__init__ will not create from files anymore, try `WordLevel.from_file` instead",
)?;
WordLevel::from_files(filename, unk_token).map_err(|e| {
exceptions::PyException::new_err(format!(
"Error while loading WordLevel: {}",
e
))
})?
};
Ok((PyWordLevel {}, PyModel::new(Arc::new(model.into()))))
} else {
Ok((
PyWordLevel {},
@@ -269,6 +354,18 @@ impl PyWordLevel {
))
}
}
#[staticmethod]
fn from_file(vocab_filename: &str, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
let unk_token = PyWordLevel::get_unk(kwargs)?;
let model = WordLevel::from_files(vocab_filename, unk_token).map_err(|e| {
exceptions::PyException::new_err(format!(
"Error while loading WordLevel from file: {}",
e
))
})?;
Ok((PyWordLevel {}, PyModel::new(Arc::new(model.into()))))
}
}
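WordPiece and WordLevel follow the same pattern as BPE: a dict is taken as an in-memory vocabulary, a string is treated as a (deprecated) file path, and an explicit from_file constructor covers file loading. A sketch with illustrative values (the unk_token kwarg is assumed to be handled as in the type stubs):

    from tokenizers.models import WordPiece, WordLevel

    vocab = {"<unk>": 0, "a": 1, "b": 2, "ab": 3}

    wp = WordPiece(vocab, unk_token="<unk>")
    wl = WordLevel(vocab, unk_token="<unk>")

    # File-based loading, without a deprecation warning (illustrative paths):
    # WordPiece.from_file("vocab.txt")
    # WordLevel.from_file("vocab.json")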
#[pyclass(extends=PyModel, module = "tokenizers.models", name=Unigram)]
@@ -277,19 +374,22 @@ pub struct PyUnigram {}
#[pymethods]
impl PyUnigram {
#[new]
fn new(vocab: Option<&str>) -> PyResult<(Self, PyModel)> {
match vocab {
Some(vocab) => match Unigram::load(vocab) {
Err(e) => Err(exceptions::PyException::new_err(format!(
"Error while loading Unigram: {}",
e
))),
Ok(model) => Ok((PyUnigram {}, PyModel::new(Arc::new(model.into())))),
},
None => Ok((
fn new(vocab: Option<Vec<(String, f64)>>, unk_id: Option<usize>) -> PyResult<(Self, PyModel)> {
match (vocab, unk_id) {
(Some(vocab), Some(unk_id)) => {
let model = Unigram::from(vocab, unk_id).map_err(|e| {
exceptions::PyException::new_err(format!("Error while loading Unigram: {}", e))
})?;
Ok((PyUnigram {}, PyModel::new(Arc::new(model.into()))))
}
(None, None) => Ok((
PyUnigram {},
PyModel::new(Arc::new(Unigram::default().into())),
)),
_ => Err(exceptions::PyValueError::new_err(
"`vocab` and `unk_id` must be both specified",
)),
}
}
}
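Unigram likewise now takes its vocabulary directly as a list of (piece, score) pairs plus the id of the unknown token, instead of a path to a serialized JSON file; both arguments must be given together, or neither. A sketch with illustrative values:

    from tokenizers.models import Unigram

    vocab = [("<unk>", 0.0), ("▁my", -2.5), ("▁name", -3.1)]

    Unigram(vocab, 0)   # unk_id points at "<unk>" in the list above
    Unigram()           # or no arguments at all, for an empty model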


@@ -10,12 +10,27 @@ class TestBPE:
def test_instantiate(self, roberta_files):
assert isinstance(BPE(), Model)
assert isinstance(BPE(), BPE)
vocab = {"a": 0, "b": 1, "ab": 2}
merges = {(0, 1): (0, 2)}
assert isinstance(BPE(vocab, merges), Model)
with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
BPE(vocab=vocab)
BPE(merges=merges)
assert isinstance(pickle.loads(pickle.dumps(BPE(vocab, merges))), BPE,)
# Deprecated calls in 0.9
with pytest.deprecated_call():
assert isinstance(BPE(roberta_files["vocab"], roberta_files["merges"]), Model)
with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
BPE(vocab=roberta_files["vocab"])
BPE(merges=roberta_files["merges"])
with pytest.deprecated_call():
assert isinstance(
pickle.loads(pickle.dumps(BPE(roberta_files["vocab"], roberta_files["merges"]))), BPE
pickle.loads(pickle.dumps(BPE(roberta_files["vocab"], roberta_files["merges"]))),
BPE,
)
@@ -23,7 +38,16 @@ class TestWordPiece:
def test_instantiate(self, bert_files):
assert isinstance(WordPiece(), Model)
assert isinstance(WordPiece(), WordPiece)
vocab = {"a": 0, "b": 1, "ab": 2}
assert isinstance(WordPiece(vocab), Model)
assert isinstance(WordPiece(vocab), WordPiece)
assert isinstance(pickle.loads(pickle.dumps(WordPiece(vocab))), WordPiece)
# Deprecated calls in 0.9
with pytest.deprecated_call():
assert isinstance(WordPiece(bert_files["vocab"]), Model)
with pytest.deprecated_call():
assert isinstance(pickle.loads(pickle.dumps(WordPiece(bert_files["vocab"]))), WordPiece)
@@ -31,7 +55,14 @@ class TestWordLevel:
def test_instantiate(self, roberta_files):
assert isinstance(WordLevel(), Model)
assert isinstance(WordLevel(), WordLevel)
vocab = {"a": 0, "b": 1, "ab": 2}
assert isinstance(WordLevel(vocab), Model)
assert isinstance(WordLevel(vocab), WordLevel)
# The WordLevel model expects a vocab.json using the same format as roberta
# so we can just try to load with this file
with pytest.deprecated_call():
assert isinstance(WordLevel(roberta_files["vocab"]), Model)
with pytest.deprecated_call():
assert isinstance(WordLevel(roberta_files["vocab"]), WordLevel)


@@ -1,3 +1,4 @@
import pytest
import pickle
from ..utils import data_dir, roberta_files
@@ -21,7 +22,7 @@ class TestBertProcessing:
assert isinstance(processor, PostProcessor)
assert isinstance(processor, BertProcessing)
assert isinstance(
pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing
pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing,
)
def test_processing(self):
@@ -66,6 +67,8 @@ class TestByteLevelProcessing:
assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
def test_processing(self, roberta_files):
# Deprecated in 0.9
with pytest.deprecated_call():
tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)


@@ -1,7 +1,12 @@
import numpy as np
import pickle
import pytest
from ..utils import data_dir, roberta_files, bert_files, multiprocessing_with_parallelism
from ..utils import (
data_dir,
roberta_files,
bert_files,
multiprocessing_with_parallelism,
)
from tokenizers import AddedToken, Tokenizer, Encoding
from tokenizers.models import Model, BPE, WordPiece
@@ -88,7 +93,11 @@ class TestTokenizer:
added = tokenizer.add_tokens(["my", "name", "is", "john"])
assert added == 4
tokens = [AddedToken("the"), AddedToken("quick", normalized=False), AddedToken()]
tokens = [
AddedToken("the"),
AddedToken("quick", normalized=False),
AddedToken(),
]
assert tokens[0].normalized == True
added = tokenizer.add_tokens(tokens)
assert added == 2
@@ -139,17 +148,36 @@ class TestTokenizer:
assert len(output) == 2
def test_encode_formats(self, bert_files):
with pytest.deprecated_call():
tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
# Encode
output = tokenizer.encode("my name is john")
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
output = tokenizer.encode("my name is john", "pair")
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output.tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]
output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output.tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]
# Encode batch
result_single = [
@@ -193,11 +221,17 @@ class TestTokenizer:
# Lists
test_single([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True)
test_pair(
[(["My", "name", "is", "John"], ["pair"]), (["My", "name", "is", "Georges"], ["pair"])],
[
(["My", "name", "is", "John"], ["pair"]),
(["My", "name", "is", "Georges"], ["pair"]),
],
True,
)
test_pair(
[[["My", "name", "is", "John"], ["pair"]], [["My", "name", "is", "Georges"], ["pair"]]],
[
[["My", "name", "is", "John"], ["pair"]],
[["My", "name", "is", "Georges"], ["pair"]],
],
True,
)
@@ -211,19 +245,27 @@ class TestTokenizer:
True,
)
test_pair(
((["My", "name", "is", "John"], ["pair"]), (["My", "name", "is", "Georges"], ["pair"])),
(
(["My", "name", "is", "John"], ["pair"]),
(["My", "name", "is", "Georges"], ["pair"]),
),
True,
)
# Numpy
test_single(np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]), True)
test_single(np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))), True)
test_single(
np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]), True,
)
test_single(
np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))), True,
)
test_pair(
np.array(
[
[["My", "name", "is", "John"], ["pair"]],
[["My", "name", "is", "Georges"], ["pair"]],
]
],
dtype=object,
),
True,
)
@@ -232,7 +274,8 @@ class TestTokenizer:
(
(("My", "name", "is", "John"), ("pair",)),
(("My", "name", "is", "Georges"), ("pair",)),
)
),
dtype=object,
),
True,
)
@@ -249,6 +292,7 @@ class TestTokenizer:
tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)
def test_encode_add_special_tokens(self, roberta_files):
with pytest.deprecated_call():
tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
tokenizer.add_special_tokens(["<s>", "</s>"])
@@ -259,7 +303,14 @@ class TestTokenizer:
# Can encode with special tokens
output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]
assert output_with_specials.tokens == [
"<s>",
"ĠMy",
"Ġname",
"Ġis",
"ĠJohn",
"</s>",
]
# Can encode without special tokens
output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)


@@ -1,16 +1,36 @@
import pytest
from ..utils import data_dir, bert_files, multiprocessing_with_parallelism
from tokenizers import BertWordPieceTokenizer
class TestBertWordPieceBPE:
def test_basic_encode(self, bert_files):
tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
# Encode with special tokens by default
output = tokenizer.encode("My name is John", "pair")
assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15), (0, 0), (0, 4), (0, 0)]
assert output.tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]
assert output.offsets == [
(0, 0),
(0, 2),
(3, 7),
(8, 10),
(11, 15),
(0, 0),
(0, 4),
(0, 0),
]
assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]
# Can encode without the special tokens
@@ -21,6 +41,6 @@ class TestBertWordPieceBPE:
assert output.type_ids == [0, 0, 0, 0, 1]
def test_multiprocessing_with_parallelism(self, bert_files):
tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
multiprocessing_with_parallelism(tokenizer, False)
multiprocessing_with_parallelism(tokenizer, True)


@@ -1,10 +1,14 @@
import pytest
from ..utils import data_dir, roberta_files, multiprocessing_with_parallelism
from tokenizers import ByteLevelBPETokenizer
class TestByteLevelBPE:
def test_basic_encode(self, roberta_files):
tokenizer = ByteLevelBPETokenizer(roberta_files["vocab"], roberta_files["merges"])
tokenizer = ByteLevelBPETokenizer.from_files(
roberta_files["vocab"], roberta_files["merges"]
)
output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
@@ -32,7 +36,7 @@ class TestByteLevelBPE:
]
def test_add_prefix_space(self, roberta_files):
tokenizer = ByteLevelBPETokenizer(
tokenizer = ByteLevelBPETokenizer.from_files(
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True
)
output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
@@ -62,8 +66,8 @@ class TestByteLevelBPE:
]
def test_lowerspace(self, roberta_files):
tokenizer = ByteLevelBPETokenizer(
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True
tokenizer = ByteLevelBPETokenizer.from_files(
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
)
output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")
@@ -81,6 +85,8 @@ class TestByteLevelBPE:
]
def test_multiprocessing_with_parallelism(self, roberta_files):
tokenizer = ByteLevelBPETokenizer(roberta_files["vocab"], roberta_files["merges"])
tokenizer = ByteLevelBPETokenizer.from_files(
roberta_files["vocab"], roberta_files["merges"]
)
multiprocessing_with_parallelism(tokenizer, False)
multiprocessing_with_parallelism(tokenizer, True)


@@ -1,10 +1,12 @@
import pytest
from ..utils import data_dir, openai_files, multiprocessing_with_parallelism
from tokenizers import CharBPETokenizer
class TestCharBPE:
def test_basic_encode(self, openai_files):
tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"])
tokenizer = CharBPETokenizer.from_files(openai_files["vocab"], openai_files["merges"])
output = tokenizer.encode("My name is John", "pair")
assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
@@ -31,7 +33,9 @@ class TestBertWordPieceBPE:
assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]
def test_lowercase(self, openai_files):
tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
tokenizer = CharBPETokenizer.from_files(
openai_files["vocab"], openai_files["merges"], lowercase=True
)
output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
assert output.ids == [547, 1362, 544, 2476, 2688]
assert output.tokens == ["my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"]
@@ -39,11 +43,13 @@ class TestBertWordPieceBPE:
assert output.type_ids == [0, 0, 0, 0, 1]
def test_decoding(self, openai_files):
tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
tokenizer = CharBPETokenizer.from_files(
openai_files["vocab"], openai_files["merges"], lowercase=True
)
decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
assert decoded == "my name is john"
def test_multiprocessing_with_parallelism(self, openai_files):
tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"])
tokenizer = CharBPETokenizer.from_files(openai_files["vocab"], openai_files["merges"])
multiprocessing_with_parallelism(tokenizer, False)
multiprocessing_with_parallelism(tokenizer, True)


@@ -101,7 +101,7 @@ impl std::fmt::Debug for WordLevel {
}
impl WordLevel {
fn builder() -> WordLevelBuilder {
pub fn builder() -> WordLevelBuilder {
WordLevelBuilder::new()
}