mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
* Adding ByteFallback support for `tokenizers`. Two items added: - A flag `byte_fallback` for the `BPE` model. This will be in charge of using `<0x61>` instead of unk on unknown tokens. - A ByteFallback decoder, which will be in charge of putting everything back into string whenever possible. Showing � when the byte decoding fails (behavior checked against LlamaTokenizer in `transformers`). * Update rustdoc. * Clippy + Add BPE(byte_fallback) into bindings. * Stupid file. * Test artifacts removed. * Update stub. * Fix. * Bad file. * CRITICAL FIX: wrapper order because of untagged.... * Remove prints. * Fixing <16 byte fallback.
95 lines
2.7 KiB
TypeScript
95 lines
2.7 KiB
TypeScript
import {
|
||
bpeDecoder,
|
||
byteFallbackDecoder,
|
||
ctcDecoder,
|
||
metaspaceDecoder,
|
||
sequenceDecoder,
|
||
wordPieceDecoder,
|
||
} from "./decoders";
|
||
|
||
// Construction and decoding checks for the `wordPieceDecoder` factory.
describe("wordPieceDecoder", () => {
  it("accepts `undefined` as first parameter", () => {
    const decoder = wordPieceDecoder(undefined);

    expect(decoder).toBeDefined();
  });

  it("accepts `undefined` as second parameter", () => {
    const decoder = wordPieceDecoder("test", undefined);

    expect(decoder).toBeDefined();
  });

  it("can decode arrays of strings", () => {
    // Pieces carrying the `##` prefix are expected to be glued onto the
    // preceding piece; the remaining pieces are joined with single spaces.
    const pieces = ["Hel", "##lo", "there", "my", "fr", "##iend"];

    const decoded = wordPieceDecoder().decode(pieces);

    expect(decoded).toEqual("Hello there my friend");
  });
});
|
||
|
||
// Construction and decoding checks for the `byteFallbackDecoder` factory.
describe("byteFallbackDecoder", () => {
  // U+FFFD REPLACEMENT CHARACTER: what the decoder is expected to emit when
  // byte tokens cannot be decoded. Written as an escape so the expectation
  // survives any re-encoding of this file.
  // NOTE(review): the expectations below assume one replacement character per
  // undecodable byte token -- confirm against the Rust ByteFallback decoder.
  const REPLACEMENT = "\uFFFD";

  it("accepts `undefined` as first parameter", () => {
    expect(byteFallbackDecoder()).toBeDefined();
  });

  it("can decode arrays of strings", () => {
    // Plain tokens are concatenated as-is.
    expect(byteFallbackDecoder().decode(["Hel", "lo"])).toEqual("Hello");
    // `<0xNN>` tokens are turned back into the byte they name (0x61 is "a").
    expect(byteFallbackDecoder().decode(["<0x61>"])).toEqual("a");
    expect(byteFallbackDecoder().decode(["<0x61>"])).toEqual("a");
    expect(byteFallbackDecoder().decode(["My", " na", "me"])).toEqual("My name");
    expect(byteFallbackDecoder().decode(["<0x61>"])).toEqual("a");

    // Incomplete byte sequences fall back to the replacement character.
    expect(byteFallbackDecoder().decode(["<0xE5>"])).toEqual(REPLACEMENT);
    expect(byteFallbackDecoder().decode(["<0xE5>", "<0x8f>"])).toEqual(
      `${REPLACEMENT}${REPLACEMENT}`
    );

    // The complete three-byte sequence E5 8F AB decodes to a single character.
    expect(byteFallbackDecoder().decode(["<0xE5>", "<0x8f>", "<0xab>"])).toEqual("叫");

    // A plain token interrupting an incomplete sequence leaves the bytes
    // before it undecodable.
    expect(byteFallbackDecoder().decode(["<0xE5>", "<0x8f>", "a"])).toEqual(
      `${REPLACEMENT}${REPLACEMENT}a`
    );
    expect(byteFallbackDecoder().decode(["<0xE5>", "<0x8f>", "<0xab>", "a"])).toEqual(
      "叫a"
    );
  });
});
|
||
|
||
// Construction checks for the `metaspaceDecoder` factory: optional arguments
// may be passed explicitly as `undefined`.
describe("metaspaceDecoder", () => {
  it("accepts `undefined` as first parameter", () => {
    const decoder = metaspaceDecoder(undefined);

    expect(decoder).toBeDefined();
  });

  it("accepts `undefined` as second parameter", () => {
    const decoder = metaspaceDecoder("t", undefined);

    expect(decoder).toBeDefined();
  });
});
|
||
|
||
// Construction check for the `bpeDecoder` factory.
describe("bpeDecoder", () => {
  it("accepts `undefined` as parameter", () => {
    const decoder = bpeDecoder(undefined);

    expect(decoder).toBeDefined();
  });
});
|
||
|
||
// Construction and decoding checks for the `ctcDecoder` factory.
describe("ctcDecoder", () => {
  it("accepts `undefined` as parameter", () => {
    expect(ctcDecoder(undefined)).toBeDefined();
  });

  // Title fixed: this test exercises `decode`, not encoding.
  it("decodes correctly", () => {
    // Expected output shows adjacent repeated tokens collapsed and "<pad>"
    // dropped; the "<pad>" between the two "l" runs keeps both of them.
    const tokens = ["<pad>", "h", "h", "e", "e", "l", "l", "<pad>", "l", "l", "o"];

    expect(ctcDecoder().decode(tokens)).toEqual("hello");
  });
});
|
||
|
||
// Construction and decoding checks for the `sequenceDecoder` factory, which
// takes a list of decoders.
describe("sequenceDecoder", () => {
  it("accepts `empty list` as parameter", () => {
    expect(sequenceDecoder([])).toBeDefined();
  });

  // Title fixed: this test exercises `decode`, not encoding.
  it("decodes correctly", () => {
    // Every token is duplicated; combining the CTC and Metaspace decoders is
    // expected to collapse the duplicates and turn "▁" into word boundaries.
    // NOTE(review): presumably the decoders run in array order -- confirm.
    const decoder = sequenceDecoder([ctcDecoder(), metaspaceDecoder()]);
    const tokens = ["▁", "▁", "H", "H", "i", "i", "▁", "y", "o", "u"];

    expect(decoder.decode(tokens)).toEqual("Hi you");
  });
});
|