Add unigram bytefallback (#1217)

* current updates will go red

* cargo fmt

* npm install

* refactor train for unigram to allow bytefallback (breaking)

* fmt

* nits

* update

* add a proper test

* fix encode optimised fallback + add trainer arg

* fixes

* fixes

* fix tests

* add test

* fmt

* fix rust test

* update python bindings

* update

* pub is okay and needed

* more fix

* cleanup

* remove useless id

* MissingUnkId error

* nits

* fix offset

* add a test in python

* update src bindings

* remove bytefallback from trainer

* styling

* update pckg

* lint

* fmt

* stup with dev

* update code based on review

* remove unused function

* update python test to compare ids

* fix option bool issues

* final fix

* clippy

* fix npm install

* update

* update test

* more in depth testing

* Lint

* last attempt to fix node

* update node bindings

* fmt

* Update tokenizers/src/models/unigram/model.rs

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>

* update based on review

* simpler test

* lint

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
Arthur
2023-06-26 17:46:59 +09:00
committed by GitHub
parent 8c9cfb0b68
commit 864135bef1
12 changed files with 7276 additions and 6540 deletions

View File

@ -170,6 +170,11 @@ export interface UnigramOptions {
* @default undefined
*/
unkId?: number;
/**
* Whether or not bytefallback support should be enabled.
* @default false
*/
byte_fallback?: boolean;
}
export namespace Unigram {

View File

@ -124,6 +124,7 @@ describe("Unigram", () => {
],
{
unkId: 0,
byte_fallback: false,
}
);
expect(unigram.constructor.name).toEqual("Model");

View File

@ -191,6 +191,7 @@ fn bpe_init(mut cx: FunctionContext) -> JsResult<JsModel> {
/// unkToken?: string,
/// continuingSubwordPrefix?: string,
/// endOfWordSuffix?: string,
/// byteFallback?: boolean
/// }, callback)
fn bpe_from_file(mut cx: FunctionContext) -> JsResult<JsUndefined> {
let (options, callback) = match cx.extract_opt::<BpeOptions>(2) {
@ -369,16 +370,16 @@ fn wordlevel_empty(mut cx: FunctionContext) -> JsResult<JsModel> {
#[serde(rename_all = "camelCase")]
struct UnigramOptions {
unk_id: Option<usize>,
byte_fallback: Option<bool>,
}
/// unigram_init(vocab: [string, number][], options?: {
/// unkId?: number,
/// byteFallback?: boolean
/// })
fn unigram_init(mut cx: FunctionContext) -> JsResult<JsModel> {
let vocab = cx.extract::<Vec<(String, f64)>>(0)?;
let options = cx.extract_opt::<UnigramOptions>(1)?.unwrap_or_default();
let unigram = tk::models::unigram::Unigram::from(vocab, options.unk_id)
let byte_fallback = options.byte_fallback.unwrap_or(false);
let unigram = tk::models::unigram::Unigram::from(vocab, options.unk_id, byte_fallback)
.map_err(|e| Error(e.to_string()))?;
let mut js_model = JsModel::new::<_, JsModel, _>(&mut cx, vec![])?;

File diff suppressed because it is too large Load Diff

View File

@ -16,7 +16,9 @@
"license": "Apache-2.0",
"dependencies": {
"@types/node": "^13.13.52",
"node-pre-gyp": "^0.14.0"
"native": "^0.3.3",
"node-pre-gyp": "^0.14.0",
"package.json": "^2.0.1"
},
"devDependencies": {
"@types/jest": "^26.0.24",