mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
* start playing around * make a first version * refactor * apply make format * add python bindings * add some python binding tests * correct pre-tokenizers * update auto-generated bindings * lint python bindings * add code node * add split to docs * refactor python binding a bit * cargo fmt * clippy and fmt in node * quick updates and fixes * Oops * Update node typings * Update changelog Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
65 lines
2.0 KiB
TypeScript
65 lines
2.0 KiB
TypeScript
import {
|
|
byteLevelPreTokenizer,
|
|
metaspacePreTokenizer,
|
|
punctuationPreTokenizer,
|
|
sequencePreTokenizer,
|
|
splitPreTokenizer,
|
|
whitespaceSplitPreTokenizer,
|
|
} from "./pre-tokenizers";
|
|
|
|
// Smoke test for the byte-level factory: calling it with no arguments must
// yield an object whose constructor is named "PreTokenizer".
describe("byteLevelPreTokenizer", () => {
  it("instantiates correctly", () => {
    const preTokenizer = byteLevelPreTokenizer();
    expect(preTokenizer.constructor.name).toEqual("PreTokenizer");
  });
});
|
|
|
|
// Tests for the metaspace factory: construction with omitted or explicitly
// `undefined` arguments, plus one end-to-end pre-tokenization check.
describe("metaspacePreTokenizer", () => {
  it("instantiates correctly without any parameter", () => {
    const preTokenizer = metaspacePreTokenizer();
    expect(preTokenizer.constructor.name).toEqual("PreTokenizer");
  });

  // An explicit `undefined` must be treated like an omitted argument
  // rather than being rejected.
  it("accepts `undefined` as first parameter", () => {
    expect(metaspacePreTokenizer(undefined)).toBeDefined();
  });

  it("accepts `undefined` as second parameter", () => {
    expect(metaspacePreTokenizer("t", undefined)).toBeDefined();
  });

  it("can pre-tokenize strings", () => {
    const preTokenizer = metaspacePreTokenizer();
    const pieces = preTokenizer.preTokenizeString("Hello there friend");

    // Each expected entry is [piece, [start, end]]. The ranges line up with
    // positions in the input string: every space shows up as "▁" at the
    // front of the following piece, and the first piece also gets a leading
    // "▁" that has no matching input character (its range is still [0, 5]).
    expect(pieces).toEqual([
      ["▁Hello", [0, 5]],
      ["▁there", [5, 11]],
      ["▁friend", [11, 18]],
    ]);
  });
});
|
|
|
|
// Smoke test for the punctuation factory: calling it with no arguments must
// yield an object whose constructor is named "PreTokenizer".
describe("punctuationPreTokenizer", () => {
  it("instantiates correctly without any parameter", () => {
    const preTokenizer = punctuationPreTokenizer();
    expect(preTokenizer.constructor.name).toEqual("PreTokenizer");
  });
});
|
|
|
|
// Smoke test for the split factory when all three positional arguments are
// supplied.
describe("splitPreTokenizer", () => {
  it("instantiates correctly with invert parameter", () => {
    // NOTE(review): going by the test title, the trailing `false` is the
    // `invert` flag; the first two arguments look like the split pattern and
    // the delimiter behavior. Confirm against the factory's signature.
    const preTokenizer = splitPreTokenizer(" ", "mergedWithPrevious", false);
    expect(preTokenizer.constructor.name).toEqual("PreTokenizer");
  });
});
|
|
|
|
// Smoke test for the sequence factory: it must build a PreTokenizer both
// from an empty list and from a list of existing pre-tokenizers.
describe("sequencePreTokenizer", () => {
  it("instantiates correctly", () => {
    const punctuation = punctuationPreTokenizer();
    const whitespace = whitespaceSplitPreTokenizer();

    // Edge case: an empty list of pre-tokenizers is accepted.
    const emptySequence = sequencePreTokenizer([]);
    expect(emptySequence.constructor.name).toEqual("PreTokenizer");

    // Typical case: two pre-tokenizers combined into one sequence.
    const combinedSequence = sequencePreTokenizer([punctuation, whitespace]);
    expect(combinedSequence.constructor.name).toEqual("PreTokenizer");
  });
});
|