mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Add unigram bytefallback (#1217)
* current updates will go red * cargo fmt * npm install * refactor train for unigram to allow bytefallback (breaking) * fmt * nits * update * add a proper test * fix encode optimised fallback + add trainer arg * fixes * fixes * fix tests * add test * fmt * fix rust test * update python bindings * update * pub is okay and needed * more fix * cleanup * remove useless id * MissingUnkId error * nits * fix offset * add a test in python * update src bindings * remove bytefallback from trainer * styling * update pckg * lint * fmt * stup with dev * update code based on review * remove unused function * update python test to compare ids * fix option bool issues * final fix * clippy * fix npm install * update * update test * more in depth testing * Lint * last attempt to fix node * update node bindings * fmt * Update tokenizers/src/models/unigram/model.rs Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> * update based on review * simpler test * lint --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
5
bindings/node/lib/bindings/models.d.ts
vendored
5
bindings/node/lib/bindings/models.d.ts
vendored
@ -170,6 +170,11 @@ export interface UnigramOptions {
|
||||
* @default undefined
|
||||
*/
|
||||
unkId?: number;
|
||||
/**
|
||||
* Whether or not bytefallback support should be enabled.
|
||||
* @default false
|
||||
*/
|
||||
byte_fallback?: boolean;
|
||||
}
|
||||
|
||||
export namespace Unigram {
|
||||
|
@ -124,6 +124,7 @@ describe("Unigram", () => {
|
||||
],
|
||||
{
|
||||
unkId: 0,
|
||||
byte_fallback: false,
|
||||
}
|
||||
);
|
||||
expect(unigram.constructor.name).toEqual("Model");
|
||||
|
@ -191,6 +191,7 @@ fn bpe_init(mut cx: FunctionContext) -> JsResult<JsModel> {
|
||||
/// unkToken?: string,
|
||||
/// continuingSubwordPrefix?: string,
|
||||
/// endOfWordSuffix?: string,
|
||||
/// byteFallback?: bool
|
||||
/// }, callback)
|
||||
fn bpe_from_file(mut cx: FunctionContext) -> JsResult<JsUndefined> {
|
||||
let (options, callback) = match cx.extract_opt::<BpeOptions>(2) {
|
||||
@ -369,16 +370,16 @@ fn wordlevel_empty(mut cx: FunctionContext) -> JsResult<JsModel> {
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct UnigramOptions {
|
||||
unk_id: Option<usize>,
|
||||
byte_fallback: Option<bool>,
|
||||
}
|
||||
|
||||
/// unigram_init(vocab: [string, number][], options?: {
|
||||
/// unkId?: number
|
||||
/// })
|
||||
fn unigram_init(mut cx: FunctionContext) -> JsResult<JsModel> {
|
||||
let vocab = cx.extract::<Vec<(String, f64)>>(0)?;
|
||||
let options = cx.extract_opt::<UnigramOptions>(1)?.unwrap_or_default();
|
||||
|
||||
let unigram = tk::models::unigram::Unigram::from(vocab, options.unk_id)
|
||||
let byte_fallback = options.byte_fallback.unwrap_or(false);
|
||||
let unigram = tk::models::unigram::Unigram::from(vocab, options.unk_id, byte_fallback)
|
||||
.map_err(|e| Error(e.to_string()))?;
|
||||
|
||||
let mut js_model = JsModel::new::<_, JsModel, _>(&mut cx, vec![])?;
|
||||
|
13620
bindings/node/package-lock.json
generated
13620
bindings/node/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -16,7 +16,9 @@
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"@types/node": "^13.13.52",
|
||||
"node-pre-gyp": "^0.14.0"
|
||||
"native": "^0.3.3",
|
||||
"node-pre-gyp": "^0.14.0",
|
||||
"package.json": "^2.0.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/jest": "^26.0.24",
|
||||
|
Reference in New Issue
Block a user