Files
tokenizers/bindings/node/lib/bindings/tokenizer.test.ts
Nicolas Patry 152880ab3e Adding truncation_side within TruncationParams. (#860)
* Add truncation to enable_truncation

* Fix typo

* Adding truncation_side within `TruncationParams`.

* Node serialization of this direction param.

* Update the test.

* Fixing warnings/lint.

* Adding stuff (can't local debug :( )

* Slow loop... ;(

* Stub.py.

Co-authored-by: Niels Rogge <niels.rogge1@gmail.com>
2021-12-28 12:37:06 +01:00

443 lines
15 KiB
TypeScript

/* eslint-disable @typescript-eslint/no-explicit-any */
/* eslint-disable @typescript-eslint/no-empty-function */
import { promisify } from "util";
import { PaddingDirection, TruncationDirection, TruncationStrategy } from "./enums";
import { BPE } from "./models";
import { RawEncoding } from "./raw-encoding";
import {
AddedToken,
EncodeInput,
EncodeOptions,
InputSequence,
PaddingConfiguration,
Tokenizer,
TruncationConfiguration,
} from "./tokenizer";
// jest.mock('../bindings/tokenizer');
// jest.mock('../bindings/models', () => ({
// __esModule: true,
// Model: jest.fn()
// }));
// Or:
// jest.mock('../bindings/models', () => {
// return require('../bindings/__mocks__/models');
// });
// const TokenizerMock = mocked(Tokenizer);
describe("AddedToken", () => {
it("instantiates with only content", () => {
const addToken = new AddedToken("test", false);
expect(addToken.constructor.name).toEqual("AddedToken");
});
it("instantiates with empty options", () => {
const addToken = new AddedToken("test", false, {});
expect(addToken.constructor.name).toEqual("AddedToken");
});
it("instantiates with options", () => {
const addToken = new AddedToken("test", false, {
leftStrip: true,
rightStrip: true,
singleWord: true,
});
expect(addToken.constructor.name).toEqual("AddedToken");
});
describe("getContent", () => {
it("returns the string content of AddedToken", () => {
const addedToken = new AddedToken("test", false);
expect(addedToken.getContent()).toEqual("test");
});
});
});
describe("Tokenizer", () => {
it("has expected methods", () => {
const model = BPE.empty();
const tokenizer = new Tokenizer(model);
expect(typeof Tokenizer.fromFile).toBe("function");
expect(typeof Tokenizer.fromString).toBe("function");
expect(typeof Tokenizer.fromPretrained).toBe("function");
expect(typeof tokenizer.addSpecialTokens).toBe("function");
expect(typeof tokenizer.addTokens).toBe("function");
expect(typeof tokenizer.decode).toBe("function");
expect(typeof tokenizer.decodeBatch).toBe("function");
expect(typeof tokenizer.disablePadding).toBe("function");
expect(typeof tokenizer.disableTruncation).toBe("function");
expect(typeof tokenizer.encode).toBe("function");
expect(typeof tokenizer.encodeBatch).toBe("function");
expect(typeof tokenizer.getDecoder).toBe("function");
expect(typeof tokenizer.getNormalizer).toBe("function");
expect(typeof tokenizer.getPostProcessor).toBe("function");
expect(typeof tokenizer.getPreTokenizer).toBe("function");
expect(typeof tokenizer.getVocab).toBe("function");
expect(typeof tokenizer.getVocabSize).toBe("function");
expect(typeof tokenizer.idToToken).toBe("function");
expect(typeof tokenizer.runningTasks).toBe("function");
expect(typeof tokenizer.save).toBe("function");
expect(typeof tokenizer.setDecoder).toBe("function");
expect(typeof tokenizer.setModel).toBe("function");
expect(typeof tokenizer.setNormalizer).toBe("function");
expect(typeof tokenizer.setPadding).toBe("function");
expect(typeof tokenizer.setPostProcessor).toBe("function");
expect(typeof tokenizer.setPreTokenizer).toBe("function");
expect(typeof tokenizer.setTruncation).toBe("function");
expect(typeof tokenizer.tokenToId).toBe("function");
expect(typeof tokenizer.toString).toBe("function");
expect(typeof tokenizer.train).toBe("function");
});
it("can be instantiated from the hub", async () => {
let tokenizer: Tokenizer;
let encode: (
sequence: InputSequence,
pair?: InputSequence | null,
options?: EncodeOptions | null
) => Promise<RawEncoding>;
let output: RawEncoding;
tokenizer = Tokenizer.fromPretrained("bert-base-cased");
encode = promisify(tokenizer.encode.bind(tokenizer));
output = await encode("Hey there dear friend!", null, { addSpecialTokens: false });
expect(output.getTokens()).toEqual(["Hey", "there", "dear", "friend", "!"]);
tokenizer = Tokenizer.fromPretrained("anthony/tokenizers-test");
encode = promisify(tokenizer.encode.bind(tokenizer));
output = await encode("Hey there dear friend!", null, { addSpecialTokens: false });
expect(output.getTokens()).toEqual(["hey", "there", "dear", "friend", "!"]);
tokenizer = Tokenizer.fromPretrained("anthony/tokenizers-test", {
revision: "gpt-2",
});
encode = promisify(tokenizer.encode.bind(tokenizer));
output = await encode("Hey there dear friend!", null, { addSpecialTokens: false });
expect(output.getTokens()).toEqual(["Hey", "Ġthere", "Ġdear", "Ġfriend", "!"]);
});
describe("addTokens", () => {
it("accepts a list of string as new tokens when initial model is empty", () => {
const model = BPE.empty();
const tokenizer = new Tokenizer(model);
const nbAdd = tokenizer.addTokens(["my", "name", "is", "john", "pair"]);
expect(nbAdd).toBe(5);
});
it("accepts a list of AddedToken as new tokens when initial model is empty", () => {
const model = BPE.empty();
const tokenizer = new Tokenizer(model);
const addedToken = new AddedToken("test", false);
const nbAdd = tokenizer.addTokens([addedToken]);
expect(nbAdd).toBe(1);
});
});
describe("encode", () => {
let tokenizer: Tokenizer;
let encode: (
sequence: InputSequence,
pair?: InputSequence | null,
options?: EncodeOptions | null
) => Promise<RawEncoding>;
let encodeBatch: (
inputs: EncodeInput[],
options?: EncodeOptions | null
) => Promise<RawEncoding[]>;
beforeEach(() => {
// Clear all instances and calls to constructor and all methods:
// TokenizerMock.mockClear();
const model = BPE.empty();
tokenizer = new Tokenizer(model);
tokenizer.addTokens(["my", "name", "is", "john", new AddedToken("pair", false)]);
encode = promisify(tokenizer.encode.bind(tokenizer));
encodeBatch = promisify(tokenizer.encodeBatch.bind(tokenizer));
});
it("accepts a pair of strings as parameters", async () => {
const encoding = await encode("my name is john", "pair");
expect(encoding).toBeDefined();
});
it("accepts a string with a null pair", async () => {
const encoding = await encode("my name is john", null);
expect(encoding).toBeDefined();
});
it("throws if we try to encode a pre-tokenized string without isPretokenized=true", async () => {
await expect((encode as any)(["my", "name", "is", "john"], null)).rejects.toThrow(
"encode with isPreTokenized=false expect string"
);
});
it("accepts a pre-tokenized string as parameter", async () => {
const encoding = await encode(["my", "name", "is", "john"], undefined, {
isPretokenized: true,
});
expect(encoding).toBeDefined();
});
it("throws if we try to encodeBatch pre-tokenized strings without isPretokenized=true", async () => {
await expect((encodeBatch as any)([["my", "name", "is", "john"]])).rejects.toThrow(
"encodeBatch with isPretokenized=false expects input to be `EncodeInput[]` " +
"with `EncodeInput = string | [string, string]`"
);
});
it("accepts a pre-tokenized input in encodeBatch", async () => {
const encoding = await encodeBatch([["my", "name", "is", "john"]], {
isPretokenized: true,
});
expect(encoding).toBeDefined();
});
it("Encodes correctly if called with only one argument", async () => {
const encoded = await encode("my name is john");
expect(encoded.getIds()).toEqual([0, 1, 2, 3]);
});
it("returns an Encoding", async () => {
const encoding = await encode("my name is john", "pair");
expect(encoding.getAttentionMask()).toEqual([1, 1, 1, 1, 1]);
const ids = encoding.getIds();
expect(Array.isArray(ids)).toBe(true);
expect(ids).toHaveLength(5);
for (const id of ids) {
expect(typeof id).toBe("number");
}
expect(encoding.getOffsets()).toEqual([
[0, 2],
[3, 7],
[8, 10],
[11, 15],
[0, 4],
]);
expect(encoding.getOverflowing()).toEqual([]);
expect(encoding.getSpecialTokensMask()).toEqual([0, 0, 0, 0, 0]);
expect(encoding.getTokens()).toEqual(["my", "name", "is", "john", "pair"]);
expect(encoding.getTypeIds()).toEqual([0, 0, 0, 0, 1]);
});
describe("when truncation is enabled", () => {
it("truncates with default if no truncation options provided", async () => {
tokenizer.setTruncation(2);
const singleEncoding = await encode("my name is john", null);
expect(singleEncoding.getTokens()).toEqual(["my", "name"]);
const pairEncoding = await encode("my name is john", "pair");
expect(pairEncoding.getTokens()).toEqual(["my", "pair"]);
});
it("throws an error with strategy `only_second` and no pair is encoded", async () => {
tokenizer.setTruncation(2, { strategy: TruncationStrategy.OnlySecond });
await expect(encode("my name is john", null)).rejects.toThrow();
});
});
describe("when padding is enabled", () => {
it("does not pad anything with default options", async () => {
tokenizer.setPadding();
const singleEncoding = await encode("my name", null);
expect(singleEncoding.getTokens()).toEqual(["my", "name"]);
const pairEncoding = await encode("my name", "pair");
expect(pairEncoding.getTokens()).toEqual(["my", "name", "pair"]);
});
it("pads to the right by default", async () => {
tokenizer.setPadding({ maxLength: 5 });
const singleEncoding = await encode("my name", null);
expect(singleEncoding.getTokens()).toEqual([
"my",
"name",
"[PAD]",
"[PAD]",
"[PAD]",
]);
const pairEncoding = await encode("my name", "pair");
expect(pairEncoding.getTokens()).toEqual([
"my",
"name",
"pair",
"[PAD]",
"[PAD]",
]);
});
it("pads to multiple of the given value", async () => {
tokenizer.setPadding({ padToMultipleOf: 8 });
const singleEncoding = await encode("my name", null);
expect(singleEncoding.getTokens()).toHaveLength(8);
const pairEncoding = await encode("my name", "pair");
expect(pairEncoding.getTokens()).toHaveLength(8);
});
});
});
describe("decode", () => {
let tokenizer: Tokenizer;
beforeEach(() => {
const model = BPE.empty();
tokenizer = new Tokenizer(model);
tokenizer.addTokens(["my", "name", "is", "john", "pair"]);
});
it("returns `undefined`", () => {
expect(tokenizer.decode([0, 1, 2, 3], true, () => {})).toBeUndefined();
});
it("has its callback called with the decoded string", async () => {
const decode = promisify(tokenizer.decode.bind(tokenizer));
await expect(decode([0, 1, 2, 3], true)).resolves.toEqual("my name is john");
});
});
describe("decodeBatch", () => {
let tokenizer: Tokenizer;
beforeEach(() => {
const model = BPE.empty();
tokenizer = new Tokenizer(model);
tokenizer.addTokens(["my", "name", "is", "john", "pair"]);
});
it("returns `undefined`", () => {
expect(tokenizer.decodeBatch([[0, 1, 2, 3], [4]], true, () => {})).toBeUndefined();
});
it("has its callback called with the decoded string", async () => {
const decodeBatch = promisify(tokenizer.decodeBatch.bind(tokenizer));
await expect(decodeBatch([[0, 1, 2, 3], [4]], true)).resolves.toEqual([
"my name is john",
"pair",
]);
});
});
describe("getVocab", () => {
it("accepts `undefined` as parameter", () => {
const model = BPE.empty();
const tokenizer = new Tokenizer(model);
expect(tokenizer.getVocab(undefined)).toBeDefined();
});
it("returns the vocabulary", () => {
const model = BPE.empty();
const tokenizer = new Tokenizer(model);
tokenizer.addTokens(["my", "name", "is", "john"]);
expect(tokenizer.getVocab(true)).toEqual({
my: 0,
name: 1,
is: 2,
john: 3,
});
});
});
describe("getVocabSize", () => {
it("accepts `undefined` as parameter", () => {
const model = BPE.empty();
const tokenizer = new Tokenizer(model);
expect(tokenizer.getVocabSize(undefined)).toBeDefined();
});
});
describe("setTruncation", () => {
it("returns the full truncation configuration", () => {
const model = BPE.empty();
const tokenizer = new Tokenizer(model);
const truncation = tokenizer.setTruncation(2);
const expectedConfig: TruncationConfiguration = {
maxLength: 2,
strategy: TruncationStrategy.LongestFirst,
stride: 0,
direction: TruncationDirection.Right,
};
expect(truncation).toEqual(expectedConfig);
});
});
describe("setPadding", () => {
it("returns the full padding params", () => {
const model = BPE.empty();
const tokenizer = new Tokenizer(model);
const padding = tokenizer.setPadding();
const expectedConfig: PaddingConfiguration = {
direction: PaddingDirection.Right,
padId: 0,
padToken: "[PAD]",
padTypeId: 0,
};
expect(padding).toEqual(expectedConfig);
});
});
describe("postProcess", () => {
let tokenizer: Tokenizer;
let encode: (
sequence: InputSequence,
pair?: InputSequence | null,
options?: EncodeOptions | null
) => Promise<RawEncoding>;
let firstEncoding: RawEncoding;
let secondEncoding: RawEncoding;
beforeAll(() => {
const model = BPE.empty();
tokenizer = new Tokenizer(model);
tokenizer.addTokens(["my", "name", "is", "john", "pair"]);
encode = promisify(tokenizer.encode.bind(tokenizer));
});
beforeEach(async () => {
firstEncoding = await encode("my name is john", null);
secondEncoding = await encode("pair", null);
tokenizer.setTruncation(2);
tokenizer.setPadding({ maxLength: 5 });
});
it("returns correctly with a single Encoding param", () => {
const encoding = tokenizer.postProcess(firstEncoding);
expect(encoding.getTokens()).toEqual(["my", "name", "[PAD]", "[PAD]", "[PAD]"]);
});
it("returns correctly with `undefined` as second and third parameters", () => {
const encoding = tokenizer.postProcess(firstEncoding, undefined, undefined);
expect(encoding.getTokens()).toEqual(["my", "name", "[PAD]", "[PAD]", "[PAD]"]);
});
it("returns correctly with 2 encodings", () => {
const encoding = tokenizer.postProcess(firstEncoding, secondEncoding);
expect(encoding.getTokens()).toEqual(["my", "pair", "[PAD]", "[PAD]", "[PAD]"]);
});
});
});