Files
tokenizers/bindings/node/lib/bindings/decoders.test.ts
Nicolas Patry 73637a0004 Adding ByteFallback support for tokenizers. (#1183)
* Adding ByteFallback support for `tokenizers`.

Two items added:

- A flag `byte_fallback` for the `BPE` model. This will be in charge
  of using `<0x61>` instead of unk on unknown tokens.
- A ByteFallback decoder, which will be in charge of putting everything
  back into a string whenever possible, showing � when the byte decoding
  fails (behavior checked against LlamaTokenizer in `transformers`).

* Update rustdoc.

* Clippy + Add BPE(byte_fallback) into bindings.

* Stupid file.

* Test artifacts removed.

* Update stub.

* Fix.

* Bad file.

* CRITICAL FIX: wrapper order because of untagged....

* Remove prints.

* Fixing <16 byte fallback.
2023-03-23 16:04:32 +01:00

95 lines
2.7 KiB
TypeScript
Raw Blame History

import {
bpeDecoder,
byteFallbackDecoder,
ctcDecoder,
metaspaceDecoder,
sequenceDecoder,
wordPieceDecoder,
} from "./decoders";
// wordPieceDecoder: the factory must tolerate omitted/`undefined` arguments,
// and the resulting decoder must join word pieces back into a sentence.
describe("wordPieceDecoder", () => {
  it("accepts `undefined` as first parameter", () => {
    const decoder = wordPieceDecoder(undefined);
    expect(decoder).toBeDefined();
  });

  it("accepts `undefined` as second parameter", () => {
    const decoder = wordPieceDecoder("test", undefined);
    expect(decoder).toBeDefined();
  });

  it("can decode arrays of strings", () => {
    // Per the expected output, pieces starting with "##" are glued onto the
    // preceding piece, while the other pieces are separated by a space.
    const pieces = ["Hel", "##lo", "there", "my", "fr", "##iend"];
    const decoded = wordPieceDecoder().decode(pieces);
    expect(decoded).toEqual("Hello there my friend");
  });
});
// byteFallbackDecoder: tokens of the form "<0xNN>" stand for raw bytes. Runs of
// such tokens that form valid UTF-8 are decoded into text; every byte token of
// a run that is not valid UTF-8 is rendered as U+FFFD (the replacement character).
describe("byteFallbackDecoder", () => {
  it("accepts `undefined` as first parameter", () => {
    expect(byteFallbackDecoder()).toBeDefined();
  });

  it("can decode arrays of strings", () => {
    // U+FFFD is written as an escape so the expectation survives any
    // re-encoding of this file (the literal character is easily garbled).
    const REPLACEMENT = "\uFFFD";

    // [input tokens, expected decoded string]
    const cases: Array<[string[], string]> = [
      // Plain tokens are concatenated untouched.
      [["Hel", "lo"], "Hello"],
      [["My", " na", "me"], "My name"],
      // A single ASCII byte token decodes to its character.
      [["<0x61>"], "a"],
      // Incomplete UTF-8 sequences: one replacement character per byte token.
      [["<0xE5>"], REPLACEMENT],
      [["<0xE5>", "<0x8f>"], REPLACEMENT + REPLACEMENT],
      // E5 8F AB is the complete UTF-8 encoding of "叫" (U+53EB).
      [["<0xE5>", "<0x8f>", "<0xab>"], "叫"],
      // A plain token ends the byte run; the incomplete run is still replaced.
      [["<0xE5>", "<0x8f>", "a"], REPLACEMENT + REPLACEMENT + "a"],
      [["<0xE5>", "<0x8f>", "<0xab>", "a"], "叫a"],
    ];

    for (const [tokens, expected] of cases) {
      expect(byteFallbackDecoder().decode(tokens)).toEqual(expected);
    }
  });
});
// metaspaceDecoder: construction must succeed when optional arguments are
// passed explicitly as `undefined`.
describe("metaspaceDecoder", () => {
  it("accepts `undefined` as first parameter", () => {
    const decoder = metaspaceDecoder(undefined);
    expect(decoder).toBeDefined();
  });

  it("accepts `undefined` as second parameter", () => {
    const decoder = metaspaceDecoder("t", undefined);
    expect(decoder).toBeDefined();
  });
});
// bpeDecoder: construction must succeed when the argument is explicitly `undefined`.
describe("bpeDecoder", () => {
  it("accepts `undefined` as parameter", () => {
    const decoder = bpeDecoder(undefined);
    expect(decoder).toBeDefined();
  });
});
// ctcDecoder: construction with an explicit `undefined` argument, plus a decode
// check on a CTC-style token stream.
describe("ctcDecoder", () => {
  it("accepts `undefined` as parameter", () => {
    expect(ctcDecoder(undefined)).toBeDefined();
  });

  // Title fixed: this test exercises `decode`, not an encoding path.
  it("decodes correctly", () => {
    // Per the expected output, consecutive repeated tokens collapse into one
    // and "<pad>" tokens are dropped; the "<pad>" between the two "l" runs is
    // what keeps the double "l" of "hello".
    const tokens = ["<pad>", "h", "h", "e", "e", "l", "l", "<pad>", "l", "l", "o"];
    expect(ctcDecoder().decode(tokens)).toEqual("hello");
  });
});
// sequenceDecoder: construction from an empty decoder list, plus a decode check
// on a chain of two decoders.
describe("sequenceDecoder", () => {
  it("accepts an empty list as parameter", () => {
    expect(sequenceDecoder([])).toBeDefined();
  });

  // Title fixed: this test exercises `decode`, not an encoding path.
  it("decodes correctly", () => {
    // The sequence is built from a CTC decoder followed by a Metaspace decoder.
    // Per the expected output, repeated tokens end up collapsed and "▁" ends up
    // as a space (with no leading space in the result).
    const decoder = sequenceDecoder([ctcDecoder(), metaspaceDecoder()]);
    const tokens = ["▁", "▁", "H", "H", "i", "i", "▁", "y", "o", "u"];
    expect(decoder.decode(tokens)).toEqual("Hi you");
  });
});