Temp work to make the APIs uniform (build from memory by default).
@@ -2,10 +2,12 @@ const native = require("./native");

module.exports = {
  BPE: {
    init: native.models_BPE_init,
    fromFiles: native.models_BPE_from_files,
    empty: native.models_BPE_empty,
  },
  WordPiece: {
    init: native.models_WordPiece_init,
    fromFiles: native.models_WordPiece_from_files,
    empty: native.models_WordPiece_empty,
  },
@@ -128,17 +128,14 @@ impl BpeOptions {
    }
}

/// bpe_from_files(vocab: String, merges: String, options: {
/// bpe_init(vocab: Map<String, u32>, merges: Map<(u32, u32), (u32, u32)>, options: {
///   cacheCapacity?: number,
///   dropout?: number,
///   unkToken?: String,
///   continuingSubwordPrefix?: String,
///   endOfWordSuffix?: String
/// }, callback)
pub fn bpe_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
    let vocab = cx.extract::<String>(0)?;
    let merges = cx.extract::<String>(1)?;

pub fn bpe_init(mut cx: FunctionContext) -> JsResult<JsUndefined> {
    let (options, callback) = match cx.extract_opt::<BpeOptions>(2) {
        // Options were there, and extracted
        Ok(Some(options)) => (options, cx.argument::<JsFunction>(3)?),
@@ -147,8 +144,38 @@ pub fn bpe_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
        // Options not specified, callback instead
        Err(_) => (BpeOptions::default(), cx.argument::<JsFunction>(2)?),
    };
    let vocab = cx.extract::<HashMap<String, u32>>(0)?;
    let merges = cx.extract::<HashMap<(u32, u32), (u32, u32)>>(1)?;

    let mut builder = tk::models::bpe::BPE::builder().vocab_and_merges(vocab, merges);

    builder = options.apply_to_bpe_builder(builder);

    let task = BPEFromFilesTask::new(builder);
    task.schedule(callback);
    Ok(cx.undefined())
}

/// bpe_from_files(vocab: String, merges: String, options: {
///   cacheCapacity?: number,
///   dropout?: number,
///   unkToken?: String,
///   continuingSubwordPrefix?: String,
///   endOfWordSuffix?: String
/// }, callback)
pub fn bpe_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
    let (options, callback) = match cx.extract_opt::<BpeOptions>(2) {
        // Options were there, and extracted
        Ok(Some(options)) => (options, cx.argument::<JsFunction>(3)?),
        // Options were undefined or null
        Ok(None) => (BpeOptions::default(), cx.argument::<JsFunction>(3)?),
        // Options not specified, callback instead
        Err(_) => (BpeOptions::default(), cx.argument::<JsFunction>(2)?),
    };
    let vocab = cx.extract::<String>(0)?;
    let merges = cx.extract::<String>(1)?;
    let mut builder = tk::models::bpe::BPE::from_files(&vocab, &merges);

    builder = options.apply_to_bpe_builder(builder);

    let task = BPEFromFilesTask::new(builder);
@@ -190,14 +217,12 @@ impl WordPieceOptions {
    }
}

/// wordpiece_from_files(vocab: String, options: {
/// wordpiece_init(vocab: Map<String, u32>, options: {
///   unkToken?: String = "[UNK]",
///   maxInputCharsPerWord?: number = 100,
///   continuingSubwordPrefix?: "##",
/// }, callback)
pub fn wordpiece_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
    let vocab = cx.extract::<String>(0)?;

pub fn wordpiece_init(mut cx: FunctionContext) -> JsResult<JsUndefined> {
    let (options, callback) = match cx.extract_opt::<WordPieceOptions>(1) {
        // Options were there, and extracted
        Ok(Some(options)) => (options, cx.argument::<JsFunction>(2)?),
@@ -207,11 +232,36 @@ pub fn wordpiece_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
        Err(_) => (WordPieceOptions::default(), cx.argument::<JsFunction>(1)?),
    };

    let mut builder = tk::models::wordpiece::WordPiece::from_files(&vocab);
    builder = options.apply_to_wordpiece_builder(builder);
    let vocab = cx.extract::<HashMap<String, u32>>(0)?;

    let mut builder = tk::models::wordpiece::WordPiece::builder().vocab(vocab);
    builder = options.apply_to_wordpiece_builder(builder);
    let task = WordPieceFromFilesTask::new(builder);
    task.schedule(callback);

    Ok(cx.undefined())
}

/// wordpiece_from_files(vocab: String, options: {
///   unkToken?: String = "[UNK]",
///   maxInputCharsPerWord?: number = 100,
///   continuingSubwordPrefix?: "##",
/// }, callback)
pub fn wordpiece_from_files(mut cx: FunctionContext) -> JsResult<JsUndefined> {
    let (options, callback) = match cx.extract_opt::<WordPieceOptions>(1) {
        // Options were there, and extracted
        Ok(Some(options)) => (options, cx.argument::<JsFunction>(2)?),
        // Options were undefined or null
        Ok(None) => (WordPieceOptions::default(), cx.argument::<JsFunction>(2)?),
        // Options not specified, callback instead
        Err(_) => (WordPieceOptions::default(), cx.argument::<JsFunction>(1)?),
    };
    let vocab = cx.extract::<String>(0)?;
    let mut builder = tk::models::wordpiece::WordPiece::from_files(&vocab);
    builder = options.apply_to_wordpiece_builder(builder);
    let task = WordPieceFromFilesTask::new(builder);
    task.schedule(callback);

    Ok(cx.undefined())
}

@@ -228,8 +278,10 @@ pub fn wordpiece_empty(mut cx: FunctionContext) -> JsResult<JsModel> {

/// Register everything here
pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
    m.export_function(&format!("{}_BPE_init", prefix), bpe_init)?;
    m.export_function(&format!("{}_BPE_from_files", prefix), bpe_from_files)?;
    m.export_function(&format!("{}_BPE_empty", prefix), bpe_empty)?;
    m.export_function(&format!("{}_WordPiece_init", prefix), wordpiece_init)?;
    m.export_function(
        &format!("{}_WordPiece_from_files", prefix),
        wordpiece_from_files,
@@ -5,7 +5,7 @@ from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing
from .base_tokenizer import BaseTokenizer

from typing import Optional, List, Union
from typing import Optional, List, Union, Dict


class BertWordPieceTokenizer(BaseTokenizer):
@@ -13,7 +13,7 @@ class BertWordPieceTokenizer(BaseTokenizer):

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
@@ -26,8 +26,8 @@ class BertWordPieceTokenizer(BaseTokenizer):
        wordpieces_prefix: str = "##",
    ):

        if vocab_file is not None:
            tokenizer = Tokenizer(WordPiece(vocab_file, unk_token=str(unk_token)))
        if vocab is not None:
            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

@@ -51,7 +51,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if vocab_file is not None:
        if vocab is not None:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
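A hedged usage sketch for the signature change above (the vocab contents and ids are illustrative; the from_file constructor is the one exercised by the updated tests further below):

    from tokenizers import BertWordPieceTokenizer

    # Build from an in-memory vocab by default (ids are illustrative)
    vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "my": 3, "name": 4}
    tokenizer = BertWordPieceTokenizer(vocab=vocab)

    # Loading from a vocab file becomes an explicit constructor instead (path is a placeholder)
    # tokenizer = BertWordPieceTokenizer.from_file("bert-vocab.txt")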
@@ -1,21 +1,28 @@
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers, processors
from tokenizers import (
    Tokenizer,
    AddedToken,
    pre_tokenizers,
    decoders,
    trainers,
    processors,
)
from tokenizers.models import BPE
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
from .base_tokenizer import BaseTokenizer

from typing import Optional, List, Union
from typing import Optional, List, Union, Dict, Tuple


class ByteLevelBPETokenizer(BaseTokenizer):
    """ ByteLevelBPETokenizer
    """ByteLevelBPETokenizer

    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
    """

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        add_prefix_space: bool = False,
        lowercase: bool = False,
        dropout: Optional[float] = None,
@@ -24,11 +31,11 @@ class ByteLevelBPETokenizer(BaseTokenizer):
        end_of_word_suffix: Optional[str] = None,
        trim_offsets: bool = False,
    ):
        if vocab_file is not None and merges_file is not None:
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab_file,
                    merges_file,
                    vocab,
                    merges,
                    dropout=dropout,
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or "",
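The same pattern applies here; a hedged sketch with illustrative values (the merges format follows the BPE docstring further below, {(id_left, id_right): (importance, id_merged)}, and from_files is what the updated tests call for file-based loading):

    from tokenizers import ByteLevelBPETokenizer

    # Illustrative in-memory vocab and merges: ids 0 and 1 merge into id 2 ("ab")
    vocab = {"a": 0, "b": 1, "ab": 2}
    merges = {(0, 1): (0, 2)}
    tokenizer = ByteLevelBPETokenizer(vocab=vocab, merges=merges)

    # File-based loading, per the updated tests (paths are placeholders)
    # tokenizer = ByteLevelBPETokenizer.from_files("vocab.json", "merges.txt")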
@@ -1,13 +1,18 @@
from .. import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from ..models import BPE
from ..normalizers import Sequence, Lowercase, unicode_normalizer_from_str, BertNormalizer
from ..normalizers import (
    Sequence,
    Lowercase,
    unicode_normalizer_from_str,
    BertNormalizer,
)
from .base_tokenizer import BaseTokenizer

from typing import Optional, List, Union
from typing import Optional, List, Union, Dict, Tuple


class CharBPETokenizer(BaseTokenizer):
    """ Original BPE Tokenizer
    """Original BPE Tokenizer

    Represents the BPE algorithm, as introduced by Rico Sennrich
    (https://arxiv.org/abs/1508.07909)
@@ -24,8 +29,8 @@ class CharBPETokenizer(BaseTokenizer):

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        suffix: str = "</w>",
        dropout: Optional[float] = None,
@@ -34,11 +39,11 @@ class CharBPETokenizer(BaseTokenizer):
        bert_normalizer: bool = True,
        split_on_whitespace_only: bool = False,
    ):
        if vocab_file is not None and merges_file is not None:
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab_file,
                    merges_file,
                    vocab,
                    merges,
                    dropout=dropout,
                    unk_token=str(unk_token),
                    end_of_word_suffix=suffix,
@@ -3,28 +3,26 @@ from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
from .base_tokenizer import BaseTokenizer

from typing import Optional, List, Union
from typing import Optional, List, Union, Dict, Tuple


class SentencePieceBPETokenizer(BaseTokenizer):
    """ SentencePiece BPE Tokenizer
    """SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    """

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
    ):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token)
            )
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token))
        else:
            tokenizer = Tokenizer(BPE())
@@ -92,19 +92,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
                "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
            )

        data = {"unk_id": unk_id, "vocab": vocab}

        replacement = "▁"
        add_prefix_space = True

        out_vocab_filename = f"{filename}.json"
        try:
            with open(out_vocab_filename, "w") as f:
                json.dump(data, f, indent=4)

            tokenizer = Tokenizer(Unigram(out_vocab_filename))
        finally:
            os.remove(out_vocab_filename)
        tokenizer = Tokenizer(Unigram(vocab, unk_id))

        tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
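The converter above now feeds the extracted pieces straight into Unigram instead of round-tripping through a temporary JSON file. A hedged, self-contained sketch of that in-memory path (the tokens and scores are illustrative):

    from tokenizers import Tokenizer
    from tokenizers.models import Unigram

    # (token, score) pairs, e.g. as extracted from a SentencePiece model (illustrative values)
    vocab = [("<unk>", 0.0), ("▁the", -2.0), ("▁quick", -5.4)]
    unk_id = 0

    tokenizer = Tokenizer(Unigram(vocab, unk_id))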
@@ -1,5 +1,5 @@
from .. import Encoding, Offsets, Token
from typing import List, Optional, Union, Tuple
from typing import List, Optional, Union, Tuple, Dict

class Model:
    """ Base class for all models
@@ -32,11 +32,15 @@ class BPE(Model):
    Instantiate a BPE Model from the given vocab and merges files.

    Args:
        vocab: ('`optional`) string:
            Path to a vocabulary JSON file.
        vocab: ('`optional`) Dict[str, int]:
            A dictionnary of string keys and their ids {"am": 0,...}

        merges: (`optional`) string:
            Path to a merge file.
            A dictionnary of pairs of ids as keys and their merge correspondace:
            {(id_left, id_right): (importance, id_merged), .... }
            with vocab : {"a": 0, "b": 1", ... "ab": 4} the merge
            {(0, 1): (0, 4) ,...}
            corresponds to the "ab" merge, that is the most likely merge (0)

        cache_capacity: (`optional`) int:
            The number of words that the BPE cache can contain. The cache allows
@@ -62,8 +66,8 @@ class BPE(Model):
    @staticmethod
    def __init__(
        self,
        vocab: Optional[str],
        merges: Optional[str],
        vocab: Optional[Union[str, Dict[str, int]]],
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]],
        cache_capacity: Optional[int],
        dropout: Optional[float],
        unk_token: Optional[str],
@@ -80,7 +84,7 @@ class WordPiece(Model):

    Args:
        vocab: (`optional`) string:
            Path to a vocabulary file.
            A dictionnary of string keys and their ids {"am": 0,...}

        unk_token: (`optional`) str:
            The unknown token to be used by the model.
@@ -91,7 +95,7 @@ class WordPiece(Model):

    def __init__(
        self,
        vocab: Optional[str],
        vocab: Optional[Union[str, Dict[str, int]]],
        unk_token: Optional[str],
        max_input_chars_per_word: Optional[int],
    ):
@@ -105,13 +109,13 @@ class WordLevel(Model):

    Args:
        vocab: (`optional`) string:
            Path to a vocabulary file.
            A dictionnary of string keys and their ids {"am": 0,...}

        unk_token: str:
            The unknown token to be used by the model.
    """

    def __init__(self, vocab: Optional[str], unk_token: Optional[str]):
    def __init__(self, vocab: Optional[Union[str, Dict[str, int]]], unk_token: Optional[str]):
        pass

class Unigram(Model):
@@ -121,10 +125,10 @@ class Unigram(Model):

    Args:
        vocab: ('`optional`) string:
            Path to a vocabulary JSON file.
            A list of vocabulary items and their relative score [("am", -0.2442),...]

    """

    @staticmethod
    def __init__(self, vocab: Optional[str]):
    def __init__(self, vocab: Optional[List[Tuple[str, float]]]):
        pass
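A hedged sketch tying the stub signatures above to concrete values (the vocab and merges literals mirror the updated tests further below; per the pyo3 binding that follows, Unigram requires vocab and unk_id together):

    from tokenizers.models import BPE, WordPiece, WordLevel, Unigram

    vocab = {"a": 0, "b": 1, "ab": 2}
    merges = {(0, 1): (0, 2)}  # (id_left, id_right) -> (importance, id_merged)

    bpe = BPE(vocab, merges)        # BPE built from in-memory dicts
    wordpiece = WordPiece(vocab)    # WordPiece from an in-memory dict
    wordlevel = WordLevel(vocab)    # WordLevel from an in-memory dict
    unigram = Unigram([("a", -1.0), ("b", -2.0)], 0)  # (token, score) pairs plus unk_id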
@@ -7,16 +7,25 @@ use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use serde::{Deserialize, Serialize};
use tk::models::bpe::BPE;
use tk::models::bpe::{BpeBuilder, BPE};
use tk::models::unigram::Unigram;
use tk::models::wordlevel::WordLevel;
use tk::models::wordpiece::WordPiece;
use tk::models::wordpiece::{WordPiece, WordPieceBuilder};
use tk::models::ModelWrapper;
use tk::{Model, Token};
use tokenizers as tk;

use super::error::ToPyResult;

fn deprecation_warning(version: &str, message: &str) -> PyResult<()> {
    let gil = pyo3::Python::acquire_gil();
    let python = gil.python();
    let deprecation_warning = python.import("builtins")?.get("DeprecationWarning")?;
    let full_message = format!("Deprecated in {}: {}", version, message);
    pyo3::PyErr::warn(python, deprecation_warning, &full_message, 0)?;
    Ok(())
}

/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass(module = "tokenizers.models", name=Model)]
@@ -137,25 +146,8 @@ impl PyModel {
#[pyclass(extends=PyModel, module = "tokenizers.models", name=BPE)]
pub struct PyBPE {}

#[pymethods]
impl PyBPE {
    #[new]
    #[args(kwargs = "**")]
    fn new(
        vocab: Option<&str>,
        merges: Option<&str>,
        kwargs: Option<&PyDict>,
    ) -> PyResult<(Self, PyModel)> {
        if (vocab.is_some() && merges.is_none()) || (vocab.is_none() && merges.is_some()) {
            return Err(exceptions::PyValueError::new_err(
                "`vocab` and `merges` must be both specified",
            ));
        }

        let mut builder = BPE::builder();
        if let (Some(vocab), Some(merges)) = (vocab, merges) {
            builder = builder.files(vocab.to_owned(), merges.to_owned());
        }
    fn with_builder(mut builder: BpeBuilder, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
        if let Some(kwargs) = kwargs {
            for (key, value) in kwargs {
                let key: &str = key.extract()?;
@@ -191,21 +183,62 @@ impl PyBPE {
    }
}

#[pymethods]
impl PyBPE {
    #[new]
    #[args(kwargs = "**")]
    fn new(
        vocab: Option<&PyAny>,
        merges: Option<&PyAny>,
        kwargs: Option<&PyDict>,
    ) -> PyResult<(Self, PyModel)> {
        if (vocab.is_some() && merges.is_none()) || (vocab.is_none() && merges.is_some()) {
            return Err(exceptions::PyValueError::new_err(
                "`vocab` and `merges` must be both specified",
            ));
        }

        let mut builder = BPE::builder();
        if let (Some(vocab_any), Some(merges_any)) = (vocab, merges) {
            if let (Ok(vocab), Ok(merges)) = (vocab_any.extract(), merges_any.extract()) {
                builder = builder.vocab_and_merges(vocab, merges);
            } else {
                let vocab_filename: String = vocab_any.extract()?;
                let merges_filename: String = merges_any.extract()?;
                deprecation_warning(
                    "0.9.0",
                    "BPE.__init__ will not create from files anymore, try `BPE.from_files` instead",
                )?;
                builder = builder.files(vocab_filename, merges_filename);
            }
        }

        PyBPE::with_builder(builder, kwargs)
    }

    #[staticmethod]
    #[args(kwargs = "**")]
    fn from_files(
        vocab_filename: String,
        merges_filename: String,
        kwargs: Option<&PyDict>,
    ) -> PyResult<(Self, PyModel)> {
        let mut builder = BPE::builder();
        builder = builder.files(vocab_filename, merges_filename);

        PyBPE::with_builder(builder, kwargs)
    }
}

/// WordPiece Model
#[pyclass(extends=PyModel, module = "tokenizers.models", name=WordPiece)]
pub struct PyWordPiece {}

#[pymethods]
impl PyWordPiece {
    #[new]
    #[args(kwargs = "**")]
    fn new(vocab: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
        let mut builder = WordPiece::builder();

        if let Some(vocab) = vocab {
            builder = builder.files(vocab.to_owned());
        }

    fn with_builder(
        mut builder: WordPieceBuilder,
        kwargs: Option<&PyDict>,
    ) -> PyResult<(Self, PyModel)> {
        if let Some(kwargs) = kwargs {
            for (key, val) in kwargs {
                let key: &str = key.extract()?;
@@ -234,14 +267,43 @@ impl PyWordPiece {
    }
}

#[pymethods]
impl PyWordPiece {
    #[new]
    #[args(kwargs = "**")]
    fn new(vocab: Option<&PyAny>, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
        let mut builder = WordPiece::builder();

        if let Some(vocab_any) = vocab {
            #[allow(deprecated)]
            if let Ok(vocab) = vocab_any.extract() {
                builder = builder.vocab(vocab);
            } else {
                deprecation_warning(
                    "0.9.0",
                    "WordPiece.__init__ will not create from files anymore, try `WordPiece.from_file` instead",
                )?;
                let vocab_filename: String = vocab_any.extract()?;
                builder = builder.files(vocab_filename);
            }
        }

        PyWordPiece::with_builder(builder, kwargs)
    }

    #[staticmethod]
    fn from_file(vocab: String, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
        let mut builder = WordPiece::builder();
        builder = builder.files(vocab);
        PyWordPiece::with_builder(builder, kwargs)
    }
}

#[pyclass(extends=PyModel, module = "tokenizers.models", name=WordLevel)]
pub struct PyWordLevel {}

#[pymethods]
impl PyWordLevel {
    #[new]
    #[args(kwargs = "**")]
    fn new(vocab: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
    fn get_unk(kwargs: Option<&PyDict>) -> PyResult<String> {
        let mut unk_token = String::from("<unk>");

        if let Some(kwargs) = kwargs {
@@ -253,15 +315,38 @@ impl PyWordLevel {
            }
        }
    }

        if let Some(vocab) = vocab {
            match WordLevel::from_files(vocab, unk_token) {
                Err(e) => Err(exceptions::PyException::new_err(format!(
                    "Error while initializing WordLevel: {}",
                    e
                ))),
                Ok(model) => Ok((PyWordLevel {}, PyModel::new(Arc::new(model.into())))),
        Ok(unk_token)
    }
}

#[pymethods]
impl PyWordLevel {
    #[new]
    #[args(kwargs = "**")]
    fn new(vocab: Option<&PyAny>, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
        let unk_token = PyWordLevel::get_unk(kwargs)?;

        if let Some(vocab_object) = vocab {
            let model = if let Ok(vocab) = vocab_object.extract() {
                WordLevel::builder()
                    .vocab(vocab)
                    .unk_token(unk_token)
                    .build()
            } else {
                let filename: &str = vocab_object.extract()?;
                deprecation_warning(
                    "0.9.0",
                    "WordLevel.__init__ will not create from files anymore, try `WordLevel.from_file` instead",
                )?;
                WordLevel::from_files(filename, unk_token).map_err(|e| {
                    exceptions::PyException::new_err(format!(
                        "Error while loading WordLevel: {}",
                        e
                    ))
                })?
            };

            Ok((PyWordLevel {}, PyModel::new(Arc::new(model.into()))))
        } else {
            Ok((
                PyWordLevel {},
@@ -269,6 +354,18 @@ impl PyWordLevel {
            ))
        }
    }

    #[staticmethod]
    fn from_file(vocab_filename: &str, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
        let unk_token = PyWordLevel::get_unk(kwargs)?;
        let model = WordLevel::from_files(vocab_filename, unk_token).map_err(|e| {
            exceptions::PyException::new_err(format!(
                "Error while loading WordLevel from file: {}",
                e
            ))
        })?;
        Ok((PyWordLevel {}, PyModel::new(Arc::new(model.into()))))
    }
}

#[pyclass(extends=PyModel, module = "tokenizers.models", name=Unigram)]
@@ -277,19 +374,22 @@ pub struct PyUnigram {}
#[pymethods]
impl PyUnigram {
    #[new]
    fn new(vocab: Option<&str>) -> PyResult<(Self, PyModel)> {
        match vocab {
            Some(vocab) => match Unigram::load(vocab) {
                Err(e) => Err(exceptions::PyException::new_err(format!(
                    "Error while loading Unigram: {}",
                    e
                ))),
                Ok(model) => Ok((PyUnigram {}, PyModel::new(Arc::new(model.into())))),
            },
            None => Ok((
    fn new(vocab: Option<Vec<(String, f64)>>, unk_id: Option<usize>) -> PyResult<(Self, PyModel)> {
        if vocab.is_some() && unk_id.is_none() || vocab.is_none() && unk_id.is_some() {}
        match (vocab, unk_id) {
            (Some(vocab), Some(unk_id)) => {
                let model = Unigram::from(vocab, unk_id).map_err(|e| {
                    exceptions::PyException::new_err(format!("Error while loading Unigram: {}", e))
                })?;
                Ok((PyUnigram {}, PyModel::new(Arc::new(model.into()))))
            }
            (None, None) => Ok((
                PyUnigram {},
                PyModel::new(Arc::new(Unigram::default().into())),
            )),
            _ => Err(exceptions::PyValueError::new_err(
                "`vocab` and `unk_id` must be both specified",
            )),
        }
    }
}
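Per the bindings above, passing filenames to the model constructors is routed through a deprecation path, while explicit file-based construction moves to staticmethods. A hedged sketch (file paths are placeholders):

    from tokenizers.models import BPE, WordPiece, WordLevel

    # Explicit file-based constructors (paths are placeholders)
    bpe = BPE.from_files("vocab.json", "merges.txt")
    wordpiece = WordPiece.from_file("vocab.txt")
    wordlevel = WordLevel.from_file("vocab.json")

    # Passing filenames to __init__ still works for now, but emits a
    # DeprecationWarning ("Deprecated in 0.9.0: ..."), as implemented above
    legacy_bpe = BPE("vocab.json", "merges.txt")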
@@ -10,12 +10,27 @@ class TestBPE:
    def test_instantiate(self, roberta_files):
        assert isinstance(BPE(), Model)
        assert isinstance(BPE(), BPE)

        vocab = {"a": 0, "b": 1, "ab": 2}
        merges = {(0, 1): (0, 2)}
        assert isinstance(BPE(vocab, merges), Model)
        with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
            BPE(vocab=vocab)
            BPE(merges=merges)

        assert isinstance(pickle.loads(pickle.dumps(BPE(vocab, merges))), BPE,)

        # Deprecated calls in 0.9
        with pytest.deprecated_call():
            assert isinstance(BPE(roberta_files["vocab"], roberta_files["merges"]), Model)

        with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
            BPE(vocab=roberta_files["vocab"])
            BPE(merges=roberta_files["merges"])
        with pytest.deprecated_call():
            assert isinstance(
                pickle.loads(pickle.dumps(BPE(roberta_files["vocab"], roberta_files["merges"]))), BPE
                pickle.loads(pickle.dumps(BPE(roberta_files["vocab"], roberta_files["merges"]))),
                BPE,
            )


@@ -23,7 +38,16 @@ class TestWordPiece:
    def test_instantiate(self, bert_files):
        assert isinstance(WordPiece(), Model)
        assert isinstance(WordPiece(), WordPiece)

        vocab = {"a": 0, "b": 1, "ab": 2}
        assert isinstance(WordPiece(vocab), Model)
        assert isinstance(WordPiece(vocab), WordPiece)
        assert isinstance(pickle.loads(pickle.dumps(WordPiece(vocab))), WordPiece)

        # Deprecated calls in 0.9
        with pytest.deprecated_call():
            assert isinstance(WordPiece(bert_files["vocab"]), Model)
        with pytest.deprecated_call():
            assert isinstance(pickle.loads(pickle.dumps(WordPiece(bert_files["vocab"]))), WordPiece)


@@ -31,7 +55,14 @@ class TestWordLevel:
    def test_instantiate(self, roberta_files):
        assert isinstance(WordLevel(), Model)
        assert isinstance(WordLevel(), WordLevel)

        vocab = {"a": 0, "b": 1, "ab": 2}
        assert isinstance(WordLevel(vocab), Model)
        assert isinstance(WordLevel(vocab), WordLevel)

        # The WordLevel model expects a vocab.json using the same format as roberta
        # so we can just try to load with this file
        with pytest.deprecated_call():
            assert isinstance(WordLevel(roberta_files["vocab"]), Model)
        with pytest.deprecated_call():
            assert isinstance(WordLevel(roberta_files["vocab"]), WordLevel)
@@ -1,3 +1,4 @@
import pytest
import pickle

from ..utils import data_dir, roberta_files
@@ -21,7 +22,7 @@ class TestBertProcessing:
        assert isinstance(processor, PostProcessor)
        assert isinstance(processor, BertProcessing)
        assert isinstance(
            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing
            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing,
        )

    def test_processing(self):
@@ -66,6 +67,8 @@ class TestByteLevelProcessing:
        assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)

    def test_processing(self, roberta_files):
        # Deprecated in 0.9
        with pytest.deprecated_call():
            tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
        tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)
@@ -1,7 +1,12 @@
import numpy as np
import pickle
import pytest
from ..utils import data_dir, roberta_files, bert_files, multiprocessing_with_parallelism
from ..utils import (
    data_dir,
    roberta_files,
    bert_files,
    multiprocessing_with_parallelism,
)

from tokenizers import AddedToken, Tokenizer, Encoding
from tokenizers.models import Model, BPE, WordPiece
@@ -88,7 +93,11 @@ class TestTokenizer:
        added = tokenizer.add_tokens(["my", "name", "is", "john"])
        assert added == 4

        tokens = [AddedToken("the"), AddedToken("quick", normalized=False), AddedToken()]
        tokens = [
            AddedToken("the"),
            AddedToken("quick", normalized=False),
            AddedToken(),
        ]
        assert tokens[0].normalized == True
        added = tokenizer.add_tokens(tokens)
        assert added == 2
@@ -139,17 +148,36 @@ class TestTokenizer:
        assert len(output) == 2

    def test_encode_formats(self, bert_files):
        with pytest.deprecated_call():
            tokenizer = BertWordPieceTokenizer(bert_files["vocab"])

        # Encode
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
        assert output.tokens == [
            "[CLS]",
            "my",
            "name",
            "is",
            "john",
            "[SEP]",
            "pair",
            "[SEP]",
        ]
        output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
        output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
        assert output.tokens == [
            "[CLS]",
            "my",
            "name",
            "is",
            "john",
            "[SEP]",
            "pair",
            "[SEP]",
        ]

        # Encode batch
        result_single = [
@@ -193,11 +221,17 @@ class TestTokenizer:
        # Lists
        test_single([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True)
        test_pair(
            [(["My", "name", "is", "John"], ["pair"]), (["My", "name", "is", "Georges"], ["pair"])],
            [
                (["My", "name", "is", "John"], ["pair"]),
                (["My", "name", "is", "Georges"], ["pair"]),
            ],
            True,
        )
        test_pair(
            [[["My", "name", "is", "John"], ["pair"]], [["My", "name", "is", "Georges"], ["pair"]]],
            [
                [["My", "name", "is", "John"], ["pair"]],
                [["My", "name", "is", "Georges"], ["pair"]],
            ],
            True,
        )

@@ -211,19 +245,27 @@ class TestTokenizer:
            True,
        )
        test_pair(
            ((["My", "name", "is", "John"], ["pair"]), (["My", "name", "is", "Georges"], ["pair"])),
            (
                (["My", "name", "is", "John"], ["pair"]),
                (["My", "name", "is", "Georges"], ["pair"]),
            ),
            True,
        )

        # Numpy
        test_single(np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]), True)
        test_single(np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))), True)
        test_single(
            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]), True,
        )
        test_single(
            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))), True,
        )
        test_pair(
            np.array(
                [
                    [["My", "name", "is", "John"], ["pair"]],
                    [["My", "name", "is", "Georges"], ["pair"]],
                ]
                ],
                dtype=object,
            ),
            True,
        )
@@ -232,7 +274,8 @@ class TestTokenizer:
                (
                    (("My", "name", "is", "John"), ("pair",)),
                    (("My", "name", "is", "Georges"), ("pair",)),
                )
                ),
                dtype=object,
            ),
            True,
        )
@@ -249,6 +292,7 @@ class TestTokenizer:
            tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)

    def test_encode_add_special_tokens(self, roberta_files):
        with pytest.deprecated_call():
            tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
        tokenizer.add_special_tokens(["<s>", "</s>"])

@@ -259,7 +303,14 @@ class TestTokenizer:

        # Can encode with special tokens
        output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
        assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]
        assert output_with_specials.tokens == [
            "<s>",
            "ĠMy",
            "Ġname",
            "Ġis",
            "ĠJohn",
            "</s>",
        ]

        # Can encode without special tokens
        output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
@@ -1,16 +1,36 @@
import pytest

from ..utils import data_dir, bert_files, multiprocessing_with_parallelism
from tokenizers import BertWordPieceTokenizer


class TestBertWordPieceBPE:
    def test_basic_encode(self, bert_files):
        tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
        tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])

        # Encode with special tokens by default
        output = tokenizer.encode("My name is John", "pair")
        assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
        assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15), (0, 0), (0, 4), (0, 0)]
        assert output.tokens == [
            "[CLS]",
            "my",
            "name",
            "is",
            "john",
            "[SEP]",
            "pair",
            "[SEP]",
        ]
        assert output.offsets == [
            (0, 0),
            (0, 2),
            (3, 7),
            (8, 10),
            (11, 15),
            (0, 0),
            (0, 4),
            (0, 0),
        ]
        assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]

        # Can encode without the special tokens
@@ -21,6 +41,6 @@ class TestBertWordPieceBPE:
        assert output.type_ids == [0, 0, 0, 0, 1]

    def test_multiprocessing_with_parallelism(self, bert_files):
        tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
        tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
        multiprocessing_with_parallelism(tokenizer, False)
        multiprocessing_with_parallelism(tokenizer, True)
@@ -1,10 +1,14 @@
import pytest

from ..utils import data_dir, roberta_files, multiprocessing_with_parallelism
from tokenizers import ByteLevelBPETokenizer


class TestByteLevelBPE:
    def test_basic_encode(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer(roberta_files["vocab"], roberta_files["merges"])
        tokenizer = ByteLevelBPETokenizer.from_files(
            roberta_files["vocab"], roberta_files["merges"]
        )
        output = tokenizer.encode("The quick brown fox jumps over the lazy dog")

        assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
@@ -32,7 +36,7 @@ class TestByteLevelBPE:
        ]

    def test_add_prefix_space(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer(
        tokenizer = ByteLevelBPETokenizer.from_files(
            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True
        )
        output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
@@ -62,8 +66,8 @@ class TestByteLevelBPE:
        ]

    def test_lowerspace(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer(
            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True
        tokenizer = ByteLevelBPETokenizer.from_files(
            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
        )
        output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")

@@ -81,6 +85,8 @@ class TestByteLevelBPE:
        ]

    def test_multiprocessing_with_parallelism(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer(roberta_files["vocab"], roberta_files["merges"])
        tokenizer = ByteLevelBPETokenizer.from_files(
            roberta_files["vocab"], roberta_files["merges"]
        )
        multiprocessing_with_parallelism(tokenizer, False)
        multiprocessing_with_parallelism(tokenizer, True)
@@ -1,10 +1,12 @@
import pytest

from ..utils import data_dir, openai_files, multiprocessing_with_parallelism
from tokenizers import CharBPETokenizer


class TestBertWordPieceBPE:
    def test_basic_encode(self, openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"])
        tokenizer = CharBPETokenizer.from_files(openai_files["vocab"], openai_files["merges"])

        output = tokenizer.encode("My name is John", "pair")
        assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
@@ -31,7 +33,9 @@ class TestBertWordPieceBPE:
        assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]

    def test_lowercase(self, openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
        tokenizer = CharBPETokenizer.from_files(
            openai_files["vocab"], openai_files["merges"], lowercase=True
        )
        output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
        assert output.ids == [547, 1362, 544, 2476, 2688]
        assert output.tokens == ["my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"]
@@ -39,11 +43,13 @@ class TestBertWordPieceBPE:
        assert output.type_ids == [0, 0, 0, 0, 1]

    def test_decoding(self, openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
        tokenizer = CharBPETokenizer.from_files(
            openai_files["vocab"], openai_files["merges"], lowercase=True
        )
        decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
        assert decoded == "my name is john"

    def test_multiprocessing_with_parallelism(self, openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"])
        tokenizer = CharBPETokenizer.from_files(openai_files["vocab"], openai_files["merges"])
        multiprocessing_with_parallelism(tokenizer, False)
        multiprocessing_with_parallelism(tokenizer, True)
@@ -101,7 +101,7 @@ impl std::fmt::Debug for WordLevel {
}

impl WordLevel {
    fn builder() -> WordLevelBuilder {
    pub fn builder() -> WordLevelBuilder {
        WordLevelBuilder::new()
    }