mirror of
https://github.com/mii443/tokenizers.git
synced 2025-09-04 00:09:34 +00:00
Node - Merge encodings
This commit is contained in:
committed by
Anthony MOI
parent
4341c79d85
commit
0408567f23
12
bindings/node/lib/bindings/utils.d.ts
vendored
12
bindings/node/lib/bindings/utils.d.ts
vendored
@ -1,3 +1,5 @@
|
|||||||
|
import { RawEncoding } from "./raw-encoding";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a subpart of a string according to specified indexes, and respecting unicode characters
|
* Returns a subpart of a string according to specified indexes, and respecting unicode characters
|
||||||
*
|
*
|
||||||
@ -10,3 +12,13 @@
|
|||||||
* @since 0.6.0
|
* @since 0.6.0
|
||||||
*/
|
*/
|
||||||
export function slice(text: string, start?: number, end?: number): string;
|
export function slice(text: string, start?: number, end?: number): string;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Merge the list of RawEncoding into one final RawEncoding
|
||||||
|
* @param encodings The list of encodings to merge
|
||||||
|
* @param [growingOffsets=false] Whether the offsets should accumulate while merging
|
||||||
|
*/
|
||||||
|
export function mergeEncodings(
|
||||||
|
encodings: RawEncoding[],
|
||||||
|
growingOffsets?: boolean
|
||||||
|
): RawEncoding;
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
const native = require("./native");
|
const native = require("./native");
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
mergeEncodings: native.utils_mergeEncodings,
|
||||||
slice: native.utils_slice
|
slice: native.utils_slice
|
||||||
};
|
};
|
||||||
|
@ -1,4 +1,9 @@
|
|||||||
import { slice } from "./utils";
|
import { promisify } from "util";
|
||||||
|
|
||||||
|
import { BPE } from "./models";
|
||||||
|
import { RawEncoding } from "./raw-encoding";
|
||||||
|
import { Tokenizer } from "./tokenizer";
|
||||||
|
import { mergeEncodings, slice } from "./utils";
|
||||||
|
|
||||||
describe("slice", () => {
|
describe("slice", () => {
|
||||||
const text = "My name is John 👋";
|
const text = "My name is John 👋";
|
||||||
@ -105,3 +110,66 @@ describe("slice", () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("mergeEncodings", () => {
|
||||||
|
let encode: (
|
||||||
|
sequence: string,
|
||||||
|
pair: string | null,
|
||||||
|
addSpecialTokens: boolean
|
||||||
|
) => Promise<RawEncoding>;
|
||||||
|
|
||||||
|
beforeEach(async () => {
|
||||||
|
const model = BPE.empty();
|
||||||
|
const tokenizer = new Tokenizer(model);
|
||||||
|
tokenizer.addTokens(["my", "name", "is", "john"]);
|
||||||
|
|
||||||
|
encode = promisify(tokenizer.encode.bind(tokenizer));
|
||||||
|
});
|
||||||
|
|
||||||
|
it("accepts `undefined` as a second parameter", () => {
|
||||||
|
const encoding = mergeEncodings([], undefined);
|
||||||
|
expect(encoding.constructor.name).toEqual("Encoding");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("returns correct result with `growingOffsets` not provided", async () => {
|
||||||
|
const firstEncoding = await encode("my name is", null, false);
|
||||||
|
const secondEncoding = await encode("john", null, false);
|
||||||
|
const encoding = mergeEncodings([firstEncoding, secondEncoding]);
|
||||||
|
|
||||||
|
expect(encoding.getTokens()).toEqual(["my", "name", "is", "john"]);
|
||||||
|
expect(encoding.getOffsets()).toEqual([
|
||||||
|
[0, 2],
|
||||||
|
[3, 7],
|
||||||
|
[8, 10],
|
||||||
|
[0, 4]
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("returns correct result when `growingOffsets` is `false`", async () => {
|
||||||
|
const firstEncoding = await encode("my name is", null, false);
|
||||||
|
const secondEncoding = await encode("john", null, false);
|
||||||
|
const encoding = mergeEncodings([firstEncoding, secondEncoding], false);
|
||||||
|
|
||||||
|
expect(encoding.getTokens()).toEqual(["my", "name", "is", "john"]);
|
||||||
|
expect(encoding.getOffsets()).toEqual([
|
||||||
|
[0, 2],
|
||||||
|
[3, 7],
|
||||||
|
[8, 10],
|
||||||
|
[0, 4]
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("returns correct result when `growingOffsets` is `true`", async () => {
|
||||||
|
const firstEncoding = await encode("my name is", null, false);
|
||||||
|
const secondEncoding = await encode("john", null, false);
|
||||||
|
const encoding = mergeEncodings([firstEncoding, secondEncoding], true);
|
||||||
|
|
||||||
|
expect(encoding.getTokens()).toEqual(["my", "name", "is", "john"]);
|
||||||
|
expect(encoding.getOffsets()).toEqual([
|
||||||
|
[0, 2],
|
||||||
|
[3, 7],
|
||||||
|
[8, 10],
|
||||||
|
[10, 14]
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import { PaddingOptions, RawEncoding } from "../bindings/raw-encoding";
|
import { PaddingOptions, RawEncoding } from "../bindings/raw-encoding";
|
||||||
|
import { mergeEncodings } from "../bindings/utils";
|
||||||
|
|
||||||
export class Encoding {
|
export class Encoding {
|
||||||
private _attentionMask?: number[];
|
private _attentionMask?: number[];
|
||||||
@ -13,6 +14,20 @@ export class Encoding {
|
|||||||
|
|
||||||
constructor(private rawEncoding: RawEncoding) {}
|
constructor(private rawEncoding: RawEncoding) {}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Merge a list of Encoding into one final Encoding
|
||||||
|
* @param encodings The list of encodings to merge
|
||||||
|
* @param [growingOffsets=false] Whether the offsets should accumulate while merging
|
||||||
|
*/
|
||||||
|
static merge(encodings: Encoding[], growingOffsets?: boolean): Encoding {
|
||||||
|
const mergedRaw = mergeEncodings(
|
||||||
|
encodings.map(e => e.rawEncoding),
|
||||||
|
growingOffsets
|
||||||
|
);
|
||||||
|
|
||||||
|
return new Encoding(mergedRaw);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Attention mask
|
* Attention mask
|
||||||
*/
|
*/
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
import { promisify } from "util";
|
import { promisify } from "util";
|
||||||
|
|
||||||
import { RawEncoding } from "../../bindings/raw-encoding";
|
|
||||||
import {
|
import {
|
||||||
AddedToken,
|
AddedToken,
|
||||||
PaddingConfiguration,
|
PaddingConfiguration,
|
||||||
|
@ -64,10 +64,16 @@ fn merge_encodings(mut cx: FunctionContext) -> JsResult<JsEncoding> {
|
|||||||
})
|
})
|
||||||
.collect::<Result<Vec<_>, neon::result::Throw>>()
|
.collect::<Result<Vec<_>, neon::result::Throw>>()
|
||||||
.map_err(|e| cx.throw_error::<_, ()>(format!("{}", e)).unwrap_err())?;
|
.map_err(|e| cx.throw_error::<_, ()>(format!("{}", e)).unwrap_err())?;
|
||||||
let growing_offsets = cx
|
|
||||||
.argument_opt(1)
|
let growing_offsets = if let Some(arg) = cx.argument_opt(1) {
|
||||||
.map(|arg| Ok(arg.downcast::<JsBoolean>().or_throw(&mut cx)?.value()))
|
if arg.downcast::<JsUndefined>().is_err() {
|
||||||
.unwrap_or(Ok(false))?;
|
arg.downcast::<JsBoolean>().or_throw(&mut cx)?.value()
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
};
|
||||||
|
|
||||||
let new_encoding = tk::tokenizer::Encoding::merge(encodings.as_slice(), growing_offsets);
|
let new_encoding = tk::tokenizer::Encoding::merge(encodings.as_slice(), growing_offsets);
|
||||||
let mut js_encoding = JsEncoding::new::<_, JsEncoding, _>(&mut cx, vec![])?;
|
let mut js_encoding = JsEncoding::new::<_, JsEncoding, _>(&mut cx, vec![])?;
|
||||||
|
Reference in New Issue
Block a user