Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
from_files -> from_file everywhere

- read_files -> read_file
- from_file pure Rust impl in python bindings
- Fix some typing in python bindings
- Added {BPE,WordLevel,WordPiece}.from_file tests.
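For callers, the rename is mechanical. A minimal before/after sketch of the renamed entry point (the vocab/merges paths here are placeholders):

    from tokenizers.models import BPE

    # Before this commit:
    #   bpe = BPE.from_files("vocab.json", "merges.txt")

    # After this commit, same behavior under the singular name:
    bpe = BPE.from_file("vocab.json", "merges.txt")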
@@ -156,14 +156,14 @@ pub fn bpe_init(mut cx: FunctionContext) -> JsResult<JsUndefined> {
     Ok(cx.undefined())
 }
 
-/// bpe_from_files(vocab: String, merges: String, options: {
+/// bpe_from_file(vocab: String, merges: String, options: {
 ///   cacheCapacity?: number,
 ///   dropout?: number,
 ///   unkToken?: String,
 ///   continuingSubwordPrefix?: String,
 ///   endOfWordSuffix?: String
 /// }, callback)
-pub fn bpe_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
+pub fn bpe_from_file(mut cx: FunctionContext) -> JsResult<JsUndefined> {
     let (options, callback) = match cx.extract_opt::<BpeOptions>(2) {
         // Options were there, and extracted
         Ok(Some(options)) => (options, cx.argument::<JsFunction>(3)?),

@@ -174,7 +174,7 @@ pub fn bpe_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
     };
     let vocab = cx.extract::<String>(0)?;
     let merges = cx.extract::<String>(1)?;
-    let mut builder = tk::models::bpe::BPE::from_files(&vocab, &merges);
+    let mut builder = tk::models::bpe::BPE::from_file(&vocab, &merges);
 
     builder = options.apply_to_bpe_builder(builder);
 

@@ -242,12 +242,12 @@ pub fn wordpiece_init(mut cx: FunctionContext) -> JsResult<JsUndefined> {
     Ok(cx.undefined())
 }
 
-/// wordpiece_from_files(vocab: String, options: {
+/// wordpiece_from_file(vocab: String, options: {
 ///   unkToken?: String = "[UNK]",
 ///   maxInputCharsPerWord?: number = 100,
 ///   continuingSubwordPrefix?: "##",
 /// }, callback)
-pub fn wordpiece_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
+pub fn wordpiece_from_file(mut cx: FunctionContext) -> JsResult<JsUndefined> {
     let (options, callback) = match cx.extract_opt::<WordPieceOptions>(1) {
         // Options were there, and extracted
         Ok(Some(options)) => (options, cx.argument::<JsFunction>(2)?),

@@ -257,7 +257,7 @@ pub fn wordpiece_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
         Err(_) => (WordPieceOptions::default(), cx.argument::<JsFunction>(1)?),
     };
     let vocab = cx.extract::<String>(0)?;
-    let mut builder = tk::models::wordpiece::WordPiece::from_files(&vocab);
+    let mut builder = tk::models::wordpiece::WordPiece::from_file(&vocab);
     builder = options.apply_to_wordpiece_builder(builder);
     let task = WordPieceFromFilesTask::new(builder);
     task.schedule(callback);

@@ -279,12 +279,12 @@ pub fn wordpiece_empty(mut cx: FunctionContext) -> JsResult<JsModel> {
 /// Register everything here
 pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
     m.export_function(&format!("{}_BPE_init", prefix), bpe_init)?;
-    m.export_function(&format!("{}_BPE_from_files", prefix), bpe_from_files)?;
+    m.export_function(&format!("{}_BPE_from_file", prefix), bpe_from_file)?;
     m.export_function(&format!("{}_BPE_empty", prefix), bpe_empty)?;
     m.export_function(&format!("{}_WordPiece_init", prefix), wordpiece_init)?;
     m.export_function(
-        &format!("{}_WordPiece_from_files", prefix),
-        wordpiece_from_files,
+        &format!("{}_WordPiece_from_file", prefix),
+        wordpiece_from_file,
     )?;
     m.export_function(&format!("{}_WordPiece_empty", prefix), wordpiece_empty)?;
     Ok(())
@@ -78,8 +78,8 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         super().__init__(tokenizer, parameters)
 
     @staticmethod
-    def from_files(vocab_filename: str, merges_filename: str, **kwargs):
-        vocab, merges = BPE.read_files(vocab_filename, merges_filename)
+    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
+        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
         return ByteLevelBPETokenizer(vocab, merges, **kwargs)
 
     def train(

@@ -95,8 +95,8 @@ class CharBPETokenizer(BaseTokenizer):
         super().__init__(tokenizer, parameters)
 
     @staticmethod
-    def from_files(vocab_filename: str, merges_filename: str, **kwargs):
-        vocab, merges = BPE.read_files(vocab_filename, merges_filename)
+    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
+        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
         return CharBPETokenizer(vocab, merges, **kwargs)
 
     def train(

@@ -48,7 +48,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         super().__init__(tokenizer, parameters)
 
     @staticmethod
-    def from_files(vocab_filename: str, merges_filename: str, **kwargs):
+    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
         vocab, merges = BPE.read_files(vocab_filename, merges_filename)
         return SentencePieceBPETokenizer(vocab, merges, **kwargs)
 
@@ -29,7 +29,7 @@ class Model:
 class BPE(Model):
     """BytePairEncoding model class
 
-    Instantiate a BPE Model from the given vocab and merges files.
+    Instantiate a BPE Model from the given vocab and merges.
 
     Args:
         vocab: ('`optional`) Dict[str, int]:

@@ -76,12 +76,19 @@ class BPE(Model):
     ):
         pass
     @staticmethod
-    def read_files(vocab_filename: str, merges_filename: str) -> Tuple[Vocab, Merges]:
+    def read_file(vocab_filename: str, merges_filename: str) -> Tuple[Vocab, Merges]:
         pass
     @staticmethod
-    def from_files(vocab_filename: str, merges_filename: str, **kwargs) -> BPE:
-        vocab, merges = BPE.read_files(vocab_filename, merges_filename)
-        return BPE(vocab, merges, **kwargs)
+    def from_file(vocab_filename: str, merges_filename: str, **kwargs) -> BPE:
+        """
+        Convenient method to initialize a BPE from files
+        Roughly equivalent to
+
+        def from_file(vocab_filename, merges_filename, **kwargs):
+            vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+            return BPE(vocab, merges, **kwargs)
+        """
+        pass
 
 class WordPiece(Model):
     """WordPiece model class

@@ -107,12 +114,19 @@ class WordPiece(Model):
     ):
         pass
     @staticmethod
-    def read_file(vocab_filename: str) -> Tuple[Vocab]:
+    def read_file(vocab_filename: str) -> Vocab:
         pass
     @staticmethod
-    def from_files(vocab_filename: str, **kwargs) -> WordPiece:
-        vocab = WordPiece.read_files(vocab_filename)
-        return WordPiece(vocab, **kwargs)
+    def from_file(vocab_filename: str, **kwargs) -> WordPiece:
+        """
+        Convenient method to initialize a WordPiece from file
+        Roughly equivalent to
+
+        def from_file(vocab_filename, **kwargs):
+            vocab = WordPiece.read_file(vocab_filename)
+            return WordPiece(vocab, **kwargs)
+        """
+        pass
 
 class WordLevel(Model):
     """

@@ -131,12 +145,19 @@ class WordLevel(Model):
     def __init__(self, vocab: Optional[Union[str, Dict[str, int]]], unk_token: Optional[str]):
         pass
     @staticmethod
-    def read_file(vocab_filename: str) -> Tuple[Vocab]:
+    def read_file(vocab_filename: str) -> Vocab:
         pass
     @staticmethod
-    def from_files(vocab_filename: str, **kwargs) -> WordLevel:
-        vocab = WordLevel.read_files(vocab_filename)
-        return WordLevel(vocab, **kwargs)
+    def from_file(vocab_filename: str, **kwargs) -> WordLevel:
+        """
+        Convenient method to initialize a WordLevel from file
+        Roughly equivalent to
+
+        def from_file(vocab_filename, **kwargs):
+            vocab = WordLevel.read_file(vocab_filename)
+            return WordLevel(vocab, **kwargs)
+        """
+        pass
 
 class Unigram(Model):
     """UnigramEncoding model class
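A short sketch of the typed surface these stubs describe (paths are placeholders; `dropout` stands in for any constructor kwarg): `BPE.read_file` returns the plain `(vocab, merges)` pair, so the convenience constructor and the explicit route are interchangeable.

    from tokenizers.models import BPE

    # One-step convenience constructor
    bpe = BPE.from_file("vocab.json", "merges.txt", dropout=0.1)

    # Equivalent explicit route: read, then construct
    vocab, merges = BPE.read_file("vocab.json", "merges.txt")
    bpe = BPE(vocab, merges, dropout=0.1)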
@@ -209,7 +209,7 @@ impl PyBPE {
             (PyVocab::Filename(vocab_filename), PyMerges::Filename(merges_filename)) => {
                 deprecation_warning(
                     "0.9.0",
-                    "BPE.__init__ will not create from files anymore, try `BPE.from_files` instead",
+                    "BPE.__init__ will not create from files anymore, try `BPE.from_file` instead",
                 )?;
                 builder =
                     builder.files(vocab_filename.to_string(), merges_filename.to_string());

@@ -226,14 +226,35 @@ impl PyBPE {
     }
 
     #[staticmethod]
-    fn read_files(vocab_filename: &str, merges_filename: &str) -> PyResult<(Vocab, Merges)> {
-        BPE::read_files(vocab_filename, merges_filename).map_err(|e| {
+    fn read_file(vocab_filename: &str, merges_filename: &str) -> PyResult<(Vocab, Merges)> {
+        BPE::read_file(vocab_filename, merges_filename).map_err(|e| {
             exceptions::PyValueError::new_err(format!(
                 "Error while reading vocab&merges files: {}",
                 e
             ))
         })
     }
+
+    #[staticmethod]
+    #[args(kwargs = "**")]
+    fn from_file(
+        py: Python,
+        vocab_filename: &str,
+        merges_filename: &str,
+        kwargs: Option<&PyDict>,
+    ) -> PyResult<Py<Self>> {
+        let (vocab, merges) = BPE::read_file(vocab_filename, merges_filename).map_err(|e| {
+            exceptions::PyValueError::new_err(format!("Error while reading BPE files: {}", e))
+        })?;
+        Py::new(
+            py,
+            PyBPE::new(
+                Some(PyVocab::Vocab(vocab)),
+                Some(PyMerges::Merges(merges)),
+                kwargs,
+            )?,
+        )
+    }
 }
 
 /// WordPiece Model

@@ -300,10 +321,19 @@ impl PyWordPiece {
 
     #[staticmethod]
     fn read_file(vocab_filename: &str) -> PyResult<Vocab> {
-        WordPiece::read_files(vocab_filename).map_err(|e| {
+        WordPiece::read_file(vocab_filename).map_err(|e| {
             exceptions::PyValueError::new_err(format!("Error while reading WordPiece file: {}", e))
         })
     }
+
+    #[staticmethod]
+    #[args(kwargs = "**")]
+    fn from_file(py: Python, vocab_filename: &str, kwargs: Option<&PyDict>) -> PyResult<Py<Self>> {
+        let vocab = WordPiece::read_file(vocab_filename).map_err(|e| {
+            exceptions::PyValueError::new_err(format!("Error while reading WordPiece file: {}", e))
+        })?;
+        Py::new(py, PyWordPiece::new(Some(PyVocab::Vocab(vocab)), kwargs)?)
+    }
 }
 
 #[pyclass(extends=PyModel, module = "tokenizers.models", name=WordLevel)]

@@ -344,7 +374,7 @@ impl PyWordLevel {
                     "0.9.0",
                     "WordLevel.__init__ will not create from files anymore, try `WordLevel.from_file` instead",
                 )?;
-                WordLevel::from_files(vocab_filename, unk_token).map_err(|e| {
+                WordLevel::from_file(vocab_filename, unk_token).map_err(|e| {
                     exceptions::PyException::new_err(format!(
                         "Error while loading WordLevel: {}",
                         e

@@ -364,10 +394,19 @@ impl PyWordLevel {
 
     #[staticmethod]
     fn read_file(vocab_filename: &str) -> PyResult<Vocab> {
-        WordLevel::read_files(vocab_filename).map_err(|e| {
+        WordLevel::read_file(vocab_filename).map_err(|e| {
             exceptions::PyValueError::new_err(format!("Error while reading WordLevel file: {}", e))
         })
     }
+
+    #[staticmethod]
+    #[args(kwargs = "**")]
+    fn from_file(py: Python, vocab_filename: &str, kwargs: Option<&PyDict>) -> PyResult<Py<Self>> {
+        let vocab = WordLevel::read_file(vocab_filename).map_err(|e| {
+            exceptions::PyValueError::new_err(format!("Error while reading WordLevel file: {}", e))
+        })?;
+        Py::new(py, PyWordLevel::new(Some(PyVocab::Vocab(vocab)), kwargs)?)
+    }
 }
 
 #[pyclass(extends=PyModel, module = "tokenizers.models", name=Unigram)]
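From Python, the new staticmethods surface read failures as ValueError, per the PyValueError mapping above. A minimal sketch (the path and the unk_token value are illustrative):

    from tokenizers.models import WordPiece

    try:
        wp = WordPiece.from_file("vocab.txt", unk_token="[UNK]")
    except ValueError as err:
        # A missing or malformed vocab file surfaces here,
        # matching the PyValueError mapping in the bindings.
        print(f"could not load WordPiece: {err}")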
@@ -14,6 +14,7 @@ class TestBPE:
         vocab = {"a": 0, "b": 1, "ab": 2}
         merges = {(0, 1): (0, 2)}
         assert isinstance(BPE(vocab, merges), Model)
+        assert isinstance(BPE.from_file(roberta_files["vocab"], roberta_files["merges"]), BPE)
         with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
             BPE(vocab=vocab)
             BPE(merges=merges)

@@ -42,6 +43,7 @@ class TestWordPiece:
         vocab = {"a": 0, "b": 1, "ab": 2}
         assert isinstance(WordPiece(vocab), Model)
         assert isinstance(WordPiece(vocab), WordPiece)
+        assert isinstance(WordPiece.from_file(bert_files["vocab"]), WordPiece)
         assert isinstance(pickle.loads(pickle.dumps(WordPiece(vocab))), WordPiece)
 
         # Deprecated calls in 0.9

@@ -59,6 +61,7 @@ class TestWordLevel:
         vocab = {"a": 0, "b": 1, "ab": 2}
         assert isinstance(WordLevel(vocab), Model)
         assert isinstance(WordLevel(vocab), WordLevel)
+        assert isinstance(WordLevel.from_file(roberta_files["vocab"]), WordLevel)
 
         # The WordLevel model expects a vocab.json using the same format as roberta
         # so we can just try to load with this file
@@ -6,9 +6,7 @@ from tokenizers import ByteLevelBPETokenizer
 
 class TestByteLevelBPE:
     def test_basic_encode(self, roberta_files):
-        tokenizer = ByteLevelBPETokenizer.from_files(
-            roberta_files["vocab"], roberta_files["merges"]
-        )
+        tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
         output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
 
         assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]

@@ -36,7 +34,7 @@ class TestByteLevelBPE:
         ]
 
     def test_add_prefix_space(self, roberta_files):
-        tokenizer = ByteLevelBPETokenizer.from_files(
+        tokenizer = ByteLevelBPETokenizer.from_file(
             roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True
         )
         output = tokenizer.encode("The quick brown fox jumps over the lazy dog")

@@ -66,7 +64,7 @@ class TestByteLevelBPE:
         ]
 
     def test_lowerspace(self, roberta_files):
-        tokenizer = ByteLevelBPETokenizer.from_files(
+        tokenizer = ByteLevelBPETokenizer.from_file(
             roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
         )
         output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")

@@ -85,8 +83,6 @@ class TestByteLevelBPE:
         ]
 
     def test_multiprocessing_with_parallelism(self, roberta_files):
-        tokenizer = ByteLevelBPETokenizer.from_files(
-            roberta_files["vocab"], roberta_files["merges"]
-        )
+        tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
         multiprocessing_with_parallelism(tokenizer, False)
         multiprocessing_with_parallelism(tokenizer, True)
@@ -6,7 +6,7 @@ from tokenizers import CharBPETokenizer
 
 class TestBertWordPieceBPE:
     def test_basic_encode(self, openai_files):
-        tokenizer = CharBPETokenizer.from_files(openai_files["vocab"], openai_files["merges"])
+        tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"])
 
         output = tokenizer.encode("My name is John", "pair")
         assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]

@@ -33,7 +33,7 @@ class TestBertWordPieceBPE:
         assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]
 
     def test_lowercase(self, openai_files):
-        tokenizer = CharBPETokenizer.from_files(
+        tokenizer = CharBPETokenizer.from_file(
             openai_files["vocab"], openai_files["merges"], lowercase=True
         )
         output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)

@@ -43,13 +43,13 @@ class TestBertWordPieceBPE:
         assert output.type_ids == [0, 0, 0, 0, 1]
 
     def test_decoding(self, openai_files):
-        tokenizer = CharBPETokenizer.from_files(
+        tokenizer = CharBPETokenizer.from_file(
             openai_files["vocab"], openai_files["merges"], lowercase=True
         )
         decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
         assert decoded == "my name is john"
 
     def test_multiprocessing_with_parallelism(self, openai_files):
-        tokenizer = CharBPETokenizer.from_files(openai_files["vocab"], openai_files["merges"])
+        tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"])
         multiprocessing_with_parallelism(tokenizer, False)
         multiprocessing_with_parallelism(tokenizer, True)
@@ -45,7 +45,7 @@ fn create_bert_tokenizer(wp: WordPiece) -> BertTokenizer {
 }
 
 pub fn bench_bert(c: &mut Criterion) {
-    let wp = WordPiece::from_files("data/bert-base-uncased-vocab.txt")
+    let wp = WordPiece::from_file("data/bert-base-uncased-vocab.txt")
         .build()
         .unwrap();
     let tokenizer = create_bert_tokenizer(wp);

@@ -30,7 +30,7 @@ fn create_gpt2_tokenizer(bpe: BPE) -> Tokenizer {
 }
 
 fn bench_gpt2(c: &mut Criterion) {
-    let bpe = BPE::from_files("data/gpt2-vocab.json", "data/gpt2-merges.txt")
+    let bpe = BPE::from_file("data/gpt2-vocab.json", "data/gpt2-merges.txt")
         .build()
         .unwrap();
     let tokenizer = create_gpt2_tokenizer(bpe);

@@ -53,7 +53,7 @@ fn bench_gpt2(c: &mut Criterion) {
         b.iter_custom(|iters| iter_bench_encode_batch(iters, tokenizer.deref(), &batches))
     });
 
-    let bpe = BPE::from_files("data/gpt2-vocab.json", "data/gpt2-merges.txt")
+    let bpe = BPE::from_file("data/gpt2-vocab.json", "data/gpt2-merges.txt")
         .cache_capacity(0)
         .build()
         .unwrap();
@@ -17,7 +17,7 @@ fn shell(matches: &ArgMatches) -> Result<()> {
         .value_of("merges")
         .expect("Must give a merges.txt file");
 
-    let bpe = BPE::from_files(vocab, merges).build()?;
+    let bpe = BPE::from_file(vocab, merges).build()?;
     let mut tokenizer = Tokenizer::new(bpe);
     tokenizer
         .with_pre_tokenizer(ByteLevel::default())
@@ -27,7 +27,7 @@
 //! use tokenizers::models::bpe::BPE;
 //!
 //! fn main() -> Result<()> {
-//!     let bpe_builder = BPE::from_files("./path/to/vocab.json", "./path/to/merges.txt");
+//!     let bpe_builder = BPE::from_file("./path/to/vocab.json", "./path/to/merges.txt");
 //!     let bpe = bpe_builder
 //!         .dropout(0.1)
 //!         .unk_token("[UNK]".into())
@@ -117,7 +117,7 @@ impl BpeBuilder {
 
         // Read files if necessary
         if let Some((vocab, merges)) = self.config.files {
-            let (v, m) = BPE::read_files(&vocab, &merges)?;
+            let (v, m) = BPE::read_file(&vocab, &merges)?;
             self.config.vocab = v;
             self.config.merges = m;
         }

@@ -258,12 +258,12 @@ impl BPE {
     }
 
     /// Initialize a BpeBuilder model from vocab and merges files
-    pub fn from_files(vocab: &str, merges: &str) -> BpeBuilder {
+    pub fn from_file(vocab: &str, merges: &str) -> BpeBuilder {
         BPE::builder().files(vocab.to_owned(), merges.to_owned())
     }
 
     /// Read the given files to extract the vocab and merges
-    pub fn read_files(vocab: &str, merges: &str) -> Result<(Vocab, Merges)> {
+    pub fn read_file(vocab: &str, merges: &str) -> Result<(Vocab, Merges)> {
         // Read vocab.json
         let vocab_file = File::open(vocab)?;
         let mut vocab_file = BufReader::new(vocab_file);

@@ -627,8 +627,8 @@ mod tests {
     }
 
     #[test]
-    // Ensure `BPE::from_files` works as expected.
-    fn test_bpe_from_files() {
+    // Ensure `BPE::from_file` works as expected.
+    fn test_bpe_from_file() {
         // Set up vocab file.
         let mut vocab_file = NamedTempFile::new().unwrap();
         vocab_file

@@ -640,7 +640,7 @@ mod tests {
         merges_file.write_all(b"#version: 0.2\na b").unwrap();
 
         // Make sure we can instantiate a BPE model from the files.
-        let builder = BPE::from_files(
+        let builder = BPE::from_file(
             vocab_file.path().to_str().unwrap(),
             merges_file.path().to_str().unwrap(),
         );

@@ -658,7 +658,7 @@ mod tests {
 
     #[test]
     // Ensure `MergeTokenOutOfVocabulary` error is returned when it should be.
-    fn test_bpe_from_files_merge_token_oov() {
+    fn test_bpe_from_file_merge_token_oov() {
         // Set up vocab file.
         let mut vocab_file = NamedTempFile::new().unwrap();
         vocab_file

@@ -669,8 +669,8 @@ mod tests {
         let mut merges_file = NamedTempFile::new().unwrap();
         merges_file.write_all(b"#version: 0.2\na b\na d").unwrap();
 
-        // Ensure the result of BPE::from_files is a MergeTokenOutOfVocabulary error.
-        match BPE::from_files(
+        // Ensure the result of BPE::from_file is a MergeTokenOutOfVocabulary error.
+        match BPE::from_file(
             vocab_file.path().to_str().unwrap(),
             merges_file.path().to_str().unwrap(),
         )

@@ -689,7 +689,7 @@ mod tests {
     #[test]
     // Ensure `BadMerges` error is returned when there is an invalid line in the
     // merges.txt file.
-    fn test_bpe_from_files_bad_merges() {
+    fn test_bpe_from_file_bad_merges() {
         // Set up vocab file.
         let mut vocab_file = NamedTempFile::new().unwrap();
         vocab_file

@@ -700,8 +700,8 @@ mod tests {
         let mut merges_file = NamedTempFile::new().unwrap();
         merges_file.write_all(b"#version: 0.2\na b\nc").unwrap();
 
-        // Ensure the result of BPE::from_files is a BadMerges error.
-        match BPE::from_files(
+        // Ensure the result of BPE::from_file is a BadMerges error.
+        match BPE::from_file(
             vocab_file.path().to_str().unwrap(),
             merges_file.path().to_str().unwrap(),
         )
@@ -107,7 +107,7 @@ impl WordLevel {
         WordLevelBuilder::new()
     }
 
-    pub fn read_files(vocab_path: &str) -> Result<Vocab> {
+    pub fn read_file(vocab_path: &str) -> Result<Vocab> {
         let vocab_file = File::open(vocab_path)?;
         let mut vocab_file = BufReader::new(vocab_file);
         let mut buffer = String::new();

@@ -131,8 +131,8 @@ impl WordLevel {
     }
 
     /// Initialize a WordLevel model from vocab and merges file.
-    pub fn from_files(vocab_path: &str, unk_token: String) -> Result<WordLevel> {
-        let vocab = WordLevel::read_files(vocab_path)?;
+    pub fn from_file(vocab_path: &str, unk_token: String) -> Result<WordLevel> {
+        let vocab = WordLevel::read_file(vocab_path)?;
         Ok(Self::builder().vocab(vocab).unk_token(unk_token).build())
     }
 }
@@ -103,7 +103,7 @@ impl WordPieceBuilder {
     /// Contructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
     pub fn build(mut self) -> Result<WordPiece> {
         if let Some(vocab) = self.config.files {
-            self.config.vocab = WordPiece::read_files(&vocab)?;
+            self.config.vocab = WordPiece::read_file(&vocab)?;
         }
 
         let vocab_r = self

@@ -165,7 +165,7 @@ impl WordPiece {
     }
 
     /// Read the given files to extract the vocab
-    pub fn read_files(vocab: &str) -> Result<Vocab> {
+    pub fn read_file(vocab: &str) -> Result<Vocab> {
         let file = File::open(vocab)?;
         let file = BufReader::new(file);
 

@@ -179,7 +179,7 @@ impl WordPiece {
     }
 
     /// Initialize a `WordPiece` model from a vocab mapping file.
-    pub fn from_files(vocab: &str) -> WordPieceBuilder {
+    pub fn from_file(vocab: &str) -> WordPieceBuilder {
         WordPiece::builder().files(vocab.to_owned())
     }
 
@@ -14,7 +14,7 @@ pub fn get_empty() -> Tokenizer {
 
 #[allow(dead_code)]
 pub fn get_byte_level_bpe() -> BPE {
-    BPE::from_files("data/gpt2-vocab.json", "data/gpt2-merges.txt")
+    BPE::from_file("data/gpt2-vocab.json", "data/gpt2-merges.txt")
         .build()
         .expect("Files not found, run `make test` to download these files")
 }

@@ -32,7 +32,7 @@ pub fn get_byte_level(add_prefix_space: bool, trim_offsets: bool) -> Tokenizer {
 
 #[allow(dead_code)]
 pub fn get_bert_wordpiece() -> WordPiece {
-    WordPiece::from_files("data/bert-base-uncased-vocab.txt")
+    WordPiece::from_file("data/bert-base-uncased-vocab.txt")
         .build()
         .expect("Files not found, run `make test` to download these files")
 }
@@ -36,7 +36,7 @@ fn wordpiece_serde() {
 
 #[test]
 fn wordlevel_serde() {
-    let wordlevel = WordLevel::from_files("data/gpt2-vocab.json", "<unk>".into()).unwrap();
+    let wordlevel = WordLevel::from_file("data/gpt2-vocab.json", "<unk>".into()).unwrap();
     let ser = serde_json::to_string(&wordlevel).unwrap();
     let de = serde_json::from_str(&ser).unwrap();
     assert_eq!(wordlevel, de);