diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index b2d69f38..cce618d0 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -203,6 +203,32 @@ impl Tokenizer {
         }
     }
 
+    #[staticmethod]
+    fn from_str(s: &str) -> PyResult<Self> {
+        let tokenizer: PyResult<tk::tokenizer::Tokenizer> = ToPyResult(s.parse()).into();
+        Ok(Self {
+            tokenizer: tokenizer?,
+        })
+    }
+
+    #[staticmethod]
+    fn from_file(path: &str) -> PyResult<Self> {
+        let tokenizer: PyResult<_> = ToPyResult(tk::tokenizer::Tokenizer::from_file(path)).into();
+        Ok(Self {
+            tokenizer: tokenizer?,
+        })
+    }
+
+    #[args(pretty = false)]
+    fn to_str(&self, pretty: bool) -> PyResult<String> {
+        ToPyResult(self.tokenizer.to_string(pretty)).into()
+    }
+
+    #[args(pretty = false)]
+    fn save(&self, path: &str, pretty: bool) -> PyResult<()> {
+        ToPyResult(self.tokenizer.save(path, pretty)).into()
+    }
+
     fn num_special_tokens_to_add(&self, is_pair: bool) -> PyResult<usize> {
         Ok(self
             .tokenizer
@@ -604,9 +630,4 @@ impl Tokenizer {
             ))
         }
     }
-
-    #[args(pretty = false)]
-    fn save(&self, path: &str, pretty: bool) -> PyResult<()> {
-        ToPyResult(self.tokenizer.save(path, pretty)).into()
-    }
 }
diff --git a/bindings/python/tokenizers/__init__.pyi b/bindings/python/tokenizers/__init__.pyi
index e020dde2..7fa34405 100644
--- a/bindings/python/tokenizers/__init__.pyi
+++ b/bindings/python/tokenizers/__init__.pyi
@@ -252,6 +252,49 @@ class Tokenizer:
             Tokenizer
         """
         pass
+    @staticmethod
+    def from_str(s: str) -> Tokenizer:
+        """ Instantiate a new Tokenizer from the given JSON string
+
+        Args:
+            s: str:
+                A JSON string representation of the Tokenizer
+
+        Returns:
+            Tokenizer
+        """
+        pass
+    @staticmethod
+    def from_file(path: str) -> Tokenizer:
+        """ Instantiate a new Tokenizer from the given file
+
+        Args:
+            path: str:
+                Path to a file containing a Tokenizer
+
+        Returns:
+            Tokenizer
+        """
+        pass
+    def to_str(self, pretty: bool = False) -> str:
+        """ Get a serialized JSON version of the Tokenizer as a str
+
+        Args:
+            pretty: bool:
+                Whether the JSON string should be prettified
+
+        Returns:
+            str
+        """
+        pass
+    def save(self, path: str, pretty: bool = False):
+        """ Save the Tokenizer as JSON to the given path
+
+        Args:
+            pretty: bool:
+                Whether the JSON string should be prettified
+        """
+        pass
     @property
     def model(self) -> Model:
         """ Get the model in use with this Tokenizer """
diff --git a/bindings/python/tokenizers/implementations/base_tokenizer.py b/bindings/python/tokenizers/implementations/base_tokenizer.py
index 99eedda8..7d6fe87e 100644
--- a/bindings/python/tokenizers/implementations/base_tokenizer.py
+++ b/bindings/python/tokenizers/implementations/base_tokenizer.py
@@ -295,7 +295,7 @@ class BaseTokenizer:
         """
         return self._tokenizer.model.save(directory, name=name)
 
-    def save(self, path: str, pretty: bool = True):
+    def save(self, path: str, pretty: bool = False):
         """ Save the current Tokenizer at the given path
 
         Args:
@@ -304,6 +304,18 @@ class BaseTokenizer:
         """
         return self._tokenizer.save(path, pretty)
 
+    def to_str(self, pretty: bool = False):
+        """ Get a serialized JSON version of the Tokenizer as a str
+
+        Args:
+            pretty: bool:
+                Whether the JSON string should be prettified
+
+        Returns:
+            str
+        """
+        return self._tokenizer.to_str(pretty)
+
     def post_process(
         self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
    ) -> Encoding:
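
Not part of the patch: a minimal sketch of how the serialization round-trip exposed by this diff could be used from Python. The empty BPE() model and the "tokenizer.json" file name are illustrative placeholders, not something introduced by the change.

from tokenizers import Tokenizer
from tokenizers.models import BPE

# Build a throwaway tokenizer just to have something to serialize
# (assumes BPE() with no arguments creates an empty model).
tokenizer = Tokenizer(BPE())

# In-memory round-trip through the JSON representation.
json_str = tokenizer.to_str(pretty=True)
restored = Tokenizer.from_str(json_str)

# On-disk round-trip; pretty now defaults to False for save() as well.
tokenizer.save("tokenizer.json")
restored = Tokenizer.from_file("tokenizer.json")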