Python - Add to/from str and files for Tokenizer

This commit is contained in:
Anthony MOI
2020-05-20 17:04:54 -04:00
parent 78db4c43dc
commit 85c7c94809
3 changed files with 82 additions and 6 deletions

View File

@@ -203,6 +203,32 @@ impl Tokenizer {
} }
} }
/// Instantiate a new Tokenizer from the given JSON string.
///
/// Parsing goes through the core library's `FromStr` impl; any error is
/// converted into a Python exception by `ToPyResult`.
#[staticmethod]
fn from_str(s: &str) -> PyResult<Self> {
    let parsed: PyResult<tk::tokenizer::Tokenizer> = ToPyResult(s.parse()).into();
    parsed.map(|tokenizer| Self { tokenizer })
}
/// Instantiate a new Tokenizer from the serialized file at `path`.
///
/// Delegates to the core library's loader; I/O or deserialization errors
/// are surfaced as Python exceptions via `ToPyResult`.
#[staticmethod]
fn from_file(path: &str) -> PyResult<Self> {
    let loaded: PyResult<_> = ToPyResult(tk::tokenizer::Tokenizer::from_file(path)).into();
    loaded.map(|tokenizer| Self { tokenizer })
}
/// Serialize this Tokenizer to a JSON string.
///
/// `pretty` (default `false` on the Python side) requests indented output.
#[args(pretty = false)]
fn to_str(&self, pretty: bool) -> PyResult<String> {
    let serialized = self.tokenizer.to_string(pretty);
    ToPyResult(serialized).into()
}
/// Save this Tokenizer as JSON to the file at `path`.
///
/// `pretty` (default `false` on the Python side) requests indented output.
#[args(pretty = false)]
fn save(&self, path: &str, pretty: bool) -> PyResult<()> {
    let outcome = self.tokenizer.save(path, pretty);
    ToPyResult(outcome).into()
}
fn num_special_tokens_to_add(&self, is_pair: bool) -> PyResult<usize> { fn num_special_tokens_to_add(&self, is_pair: bool) -> PyResult<usize> {
Ok(self Ok(self
.tokenizer .tokenizer
@@ -604,9 +630,4 @@ impl Tokenizer {
)) ))
} }
} }
// NOTE(review): these are deletion lines in the diff — the pre-move copy of
// `save`, relocated earlier in the impl block by this commit. Behavior is
// unchanged by the move.
#[args(pretty = false)]
// Save the Tokenizer as JSON to `path`; `pretty` requests indented output.
fn save(&self, path: &str, pretty: bool) -> PyResult<()> {
    ToPyResult(self.tokenizer.save(path, pretty)).into()
}
} }

View File

@@ -252,6 +252,49 @@ class Tokenizer:
Tokenizer Tokenizer
""" """
pass pass
@staticmethod
def from_str(s: str) -> Tokenizer:
    """Instantiate a new Tokenizer from the given JSON string.

    Type-stub declaration; the implementation lives in the native extension.

    Args:
        s: str:
            A JSON string representation of the Tokenizer.

    Returns:
        Tokenizer
    """
    pass
@staticmethod
def from_file(path: str) -> Tokenizer:
    """Instantiate a new Tokenizer from the given file.

    Type-stub declaration; the implementation lives in the native extension.

    Args:
        path: str:
            Path to a file containing a serialized Tokenizer.

    Returns:
        Tokenizer
    """
    pass
def to_str(self, pretty: bool = False) -> str:
    """Get a serialized JSON version of the Tokenizer as a str.

    Type-stub declaration; the implementation lives in the native extension.

    Args:
        pretty: bool:
            Whether the JSON string should be prettified.

    Returns:
        str
    """
    pass
def save(self, path: str, pretty: bool = False):
    """Save the Tokenizer as JSON to the given path.

    Type-stub declaration; the implementation lives in the native extension.

    Args:
        path: str:
            Path of the file where the Tokenizer will be saved.
        pretty: bool:
            Whether the JSON string should be prettified.
    """
    pass
@property @property
def model(self) -> Model: def model(self) -> Model:
""" Get the model in use with this Tokenizer """ """ Get the model in use with this Tokenizer """

View File

@@ -295,7 +295,7 @@ class BaseTokenizer:
""" """
return self._tokenizer.model.save(directory, name=name) return self._tokenizer.model.save(directory, name=name)
def save(self, path: str, pretty: bool = True): def save(self, path: str, pretty: bool = False):
""" Save the current Tokenizer at the given path """ Save the current Tokenizer at the given path
Args: Args:
@@ -304,6 +304,18 @@ class BaseTokenizer:
""" """
return self._tokenizer.save(path, pretty) return self._tokenizer.save(path, pretty)
def to_str(self, pretty: bool = False):
    """Get a serialized JSON version of the Tokenizer as a str.

    Thin wrapper that forwards to the underlying native Tokenizer.

    Args:
        pretty: bool:
            Whether the JSON string should be prettified.

    Returns:
        str
    """
    serialized = self._tokenizer.to_str(pretty)
    return serialized
def post_process( def post_process(
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
) -> Encoding: ) -> Encoding: