Node - Fix new linting errors
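
The hunks below are mechanical lint/format fixes across the Node bindings and their tests: trailing commas are added to multi-line object literals, arrays, and import/export lists; single arrow-function parameters are wrapped in parentheses ("done => ..." becomes "(done) => ..."); and the banned empty-object type {} is replaced with Record<string, unknown>. A Prettier configuration along the following lines would produce this formatting; it is a sketch based on Prettier's documented options, not the repository's actual config, which is not part of this diff.

// Hypothetical .prettierrc.js — a sketch of settings consistent with the
// formatting changes in this commit; the project's real config is not shown here.
module.exports = {
  // Add trailing commas wherever the syntax allows them
  // (object literals, arrays, import/export specifiers, call arguments).
  trailingComma: "all",
  // Always parenthesize single arrow-function parameters:
  // "done => {}" is reformatted to "(done) => {}".
  arrowParens: "always",
};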
@@ -4,5 +4,5 @@ module.exports = {
 byteLevelDecoder: native.decoders_ByteLevel,
 wordPieceDecoder: native.decoders_WordPiece,
 metaspaceDecoder: native.decoders_Metaspace,
-bpeDecoder: native.decoders_BPEDecoder
+bpeDecoder: native.decoders_BPEDecoder,
 };
@@ -1,10 +1,10 @@
 export enum TruncationStrategy {
 LongestFirst = "longest_first",
 OnlyFirst = "only_first",
-OnlySecond = "only_second"
+OnlySecond = "only_second",
 }

 export enum PaddingDirection {
 Left = "left",
-Right = "right"
+Right = "right",
 }
@@ -3,10 +3,10 @@ const native = require("./native");
 module.exports = {
 BPE: {
 fromFiles: native.models_BPE_from_files,
-empty: native.models_BPE_empty
+empty: native.models_BPE_empty,
 },
 WordPiece: {
 fromFiles: native.models_WordPiece_from_files,
-empty: native.models_WordPiece_empty
-}
+empty: native.models_WordPiece_empty,
+},
 };
@@ -23,7 +23,7 @@ describe("WordPiece", () => {
 });

 it("has its callback called with the loaded model", () => {
-return new Promise(done => {
+return new Promise((done) => {
 WordPiece.fromFiles(`${MOCKS_DIR}/vocab.txt`, (err, model) => {
 expect(model).toBeDefined();
 done();
@@ -40,7 +40,7 @@ describe("WordPiece", () => {
 });

 it("has its callback called with the loaded model", () => {
-return new Promise(done => {
+return new Promise((done) => {
 WordPiece.fromFiles(`${MOCKS_DIR}/vocab.txt`, {}, (err, model) => {
 expect(model).toBeDefined();
 done();
@@ -72,7 +72,7 @@ describe("BPE", () => {
 });

 it("has its callback called with the loaded model", () => {
-return new Promise(done => {
+return new Promise((done) => {
 BPE.fromFiles(
 `${MOCKS_DIR}/vocab.json`,
 `${MOCKS_DIR}/merges.txt`,
@@ -93,7 +93,7 @@ describe("BPE", () => {
 });

 it("has its callback called with the loaded model", () => {
-return new Promise(done => {
+return new Promise((done) => {
 BPE.fromFiles(
 `${MOCKS_DIR}/vocab.json`,
 `${MOCKS_DIR}/merges.txt`,
@@ -8,5 +8,5 @@ module.exports = {
 nfkcNormalizer: native.normalizers_NFKC,
 nfkdNormalizer: native.normalizers_NFKD,
 sequenceNormalizer: native.normalizers_Sequence,
-stripNormalizer: native.normalizers_Strip
+stripNormalizer: native.normalizers_Strip,
 };
@@ -3,5 +3,5 @@ const native = require("./native");
 module.exports = {
 bertProcessing: native.processors_BertProcessing,
 byteLevelProcessing: native.processors_ByteLevel,
-robertaProcessing: native.processors_RobertaProcessing
+robertaProcessing: native.processors_RobertaProcessing,
 };
@@ -3,7 +3,7 @@
 import {
 bertProcessing,
 byteLevelProcessing,
-robertaProcessing
+robertaProcessing,
 } from "./post-processors";

 describe("bertProcessing", () => {
@@ -7,5 +7,5 @@ module.exports = {
 whitespaceSplitPreTokenizer: native.pre_tokenizers_WhitespaceSplit,
 bertPreTokenizer: native.pre_tokenizers_BertPreTokenizer,
 metaspacePreTokenizer: native.pre_tokenizers_Metaspace,
-charDelimiterSplitPreTokenizer: native.pre_tokenizers_CharDelimiterSplit
+charDelimiterSplitPreTokenizer: native.pre_tokenizers_CharDelimiterSplit,
 };
@@ -21,7 +21,7 @@ describe("RawEncoding", () => {
 const model = await promisify<string, WordPieceOptions, Model>(WordPiece.fromFiles)(
 `${MOCKS_DIR}/vocab.txt`,
 {
-continuingSubwordPrefix: "##"
+continuingSubwordPrefix: "##",
 }
 );

@@ -155,7 +155,7 @@ describe("RawEncoding", () => {
 direction: PaddingDirection.Left,
 padToken: "[PA]",
 padTypeId: 10,
-padId: 400
+padId: 400,
 });

 const tokens = encoding.getTokens();
@@ -7,5 +7,5 @@ class Tokenizer extends native.tokenizer_Tokenizer {

 module.exports = {
 AddedToken: native.tokenizer_AddedToken,
-Tokenizer
+Tokenizer,
 };
@@ -14,7 +14,7 @@ import {
 InputSequence,
 PaddingConfiguration,
 Tokenizer,
-TruncationConfiguration
+TruncationConfiguration,
 } from "./tokenizer";

 // jest.mock('../bindings/tokenizer');
@@ -45,7 +45,7 @@ describe("AddedToken", () => {
 const addToken = new AddedToken("test", false, {
 leftStrip: true,
 rightStrip: true,
-singleWord: true
+singleWord: true,
 });
 expect(addToken.constructor.name).toEqual("AddedToken");
 });
@@ -156,7 +156,7 @@ describe("Tokenizer", () => {

 it("accepts a pre-tokenized string as parameter", async () => {
 const encoding = await encode(["my", "name", "is", "john"], undefined, {
-isPretokenized: true
+isPretokenized: true,
 });
 expect(encoding).toBeDefined();
 });
@@ -170,7 +170,7 @@ describe("Tokenizer", () => {

 it("accepts a pre-tokenized input in encodeBatch", async () => {
 const encoding = await encodeBatch([["my", "name", "is", "john"]], {
-isPretokenized: true
+isPretokenized: true,
 });
 expect(encoding).toBeDefined();
 });
@@ -198,7 +198,7 @@ describe("Tokenizer", () => {
 [3, 7],
 [8, 10],
 [11, 15],
-[0, 4]
+[0, 4],
 ]);
 expect(encoding.getOverflowing()).toEqual([]);
 expect(encoding.getSpecialTokensMask()).toEqual([0, 0, 0, 0, 0]);
@@ -243,7 +243,7 @@ describe("Tokenizer", () => {
 "name",
 "[PAD]",
 "[PAD]",
-"[PAD]"
+"[PAD]",
 ]);

 const pairEncoding = await encode("my name", "pair");
@@ -252,7 +252,7 @@ describe("Tokenizer", () => {
 "name",
 "pair",
 "[PAD]",
-"[PAD]"
+"[PAD]",
 ]);
 });

@@ -304,7 +304,7 @@ describe("Tokenizer", () => {
 const decodeBatch = promisify(tokenizer.decodeBatch.bind(tokenizer));
 await expect(decodeBatch([[0, 1, 2, 3], [4]], true)).resolves.toEqual([
 "my name is john",
-"pair"
+"pair",
 ]);
 });
 });
@@ -326,7 +326,7 @@ describe("Tokenizer", () => {
 my: 0,
 name: 1,
 is: 2,
-john: 3
+john: 3,
 });
 });
 });
@@ -349,7 +349,7 @@ describe("Tokenizer", () => {
 const expectedConfig: TruncationConfiguration = {
 maxLength: 2,
 strategy: TruncationStrategy.LongestFirst,
-stride: 0
+stride: 0,
 };
 expect(truncation).toEqual(expectedConfig);
 });
@@ -365,7 +365,7 @@ describe("Tokenizer", () => {
 direction: PaddingDirection.Right,
 padId: 0,
 padToken: "[PAD]",
-padTypeId: 0
+padTypeId: 0,
 };
 expect(padding).toEqual(expectedConfig);
 });
@@ -2,5 +2,5 @@ const native = require("./native");

 module.exports = {
 bpeTrainer: native.trainers_BPETrainer,
-wordPieceTrainer: native.trainers_WordPieceTrainer
+wordPieceTrainer: native.trainers_WordPieceTrainer,
 };
@@ -2,5 +2,5 @@ const native = require("./native");

 module.exports = {
 mergeEncodings: native.utils_mergeEncodings,
-slice: native.utils_slice
+slice: native.utils_slice,
 };
@@ -141,7 +141,7 @@ describe("mergeEncodings", () => {
 [0, 2],
 [3, 7],
 [8, 10],
-[0, 4]
+[0, 4],
 ]);
 });

@@ -155,7 +155,7 @@ describe("mergeEncodings", () => {
 [0, 2],
 [3, 7],
 [8, 10],
-[0, 4]
+[0, 4],
 ]);
 });

@@ -169,7 +169,7 @@ describe("mergeEncodings", () => {
 [0, 2],
 [3, 7],
 [8, 10],
-[10, 14]
+[10, 14],
 ]);
 });
 });
@@ -10,7 +10,7 @@ describe("Encoding", () => {
 describe("ids", () => {
 const getIdsMock = jest.fn(() => [3]);
 const m = rawEncodingMock.mockImplementation(() => ({
-getIds: getIdsMock
+getIds: getIdsMock,
 }));

 encoding = new Encoding(m() as RawEncoding);
@@ -36,7 +36,7 @@ describe("Encoding", () => {
 const getIdsMock = jest.fn(() => [4]);
 const m = rawEncodingMock.mockImplementation(() => ({
 getIds: getIdsMock,
-pad: jest.fn()
+pad: jest.fn(),
 }));

 encoding = new Encoding(m() as RawEncoding);
@@ -55,7 +55,7 @@ describe("Encoding", () => {
 const getIdsMock = jest.fn(() => [4]);
 const m = rawEncodingMock.mockImplementation(() => ({
 getIds: getIdsMock,
-truncate: jest.fn()
+truncate: jest.fn(),
 }));

 encoding = new Encoding(m() as RawEncoding);
@@ -21,7 +21,7 @@ export class Encoding {
 */
 static merge(encodings: Encoding[], growingOffsets?: boolean): Encoding {
 const mergedRaw = mergeEncodings(
-encodings.map(e => e.rawEncoding),
+encodings.map((e) => e.rawEncoding),
 growingOffsets
 );

@@ -82,7 +82,7 @@ export class Encoding {

 return (this._overflowing = this._rawEncoding
 .getOverflowing()
-.map(e => new Encoding(e)));
+.map((e) => new Encoding(e)));
 }

 /**
@@ -218,7 +218,7 @@ export class Encoding {
 "_specialTokensMask",
 "_tokens",
 "_typeIds",
-"_wordIndexes"
+"_wordIndexes",
 ]) {
 delete this[prop as keyof this];
 }
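The next group of hunks touches BaseTokenizer: the test's type parameter changes from {} to Record<string, unknown>, and base.tokenizer.ts keeps TConfig extends object behind an eslint-disable comment for @typescript-eslint/ban-types. A minimal TypeScript sketch of why that rule flags {} (assuming the rule's default settings):

// Sketch: the type "{}" means "any non-nullish value", so it does not constrain
// a value to be an object — a primitive is accepted without complaint:
const looseConfig: {} = 42; // compiles, which is rarely what was intended
// "Record<string, unknown>" actually requires an object with unknown values:
// const badConfig: Record<string, unknown> = 42; // type error
const strictConfig: Record<string, unknown> = { lowercase: true };
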
@@ -3,12 +3,12 @@ import { BPE } from "../../bindings/models";
 import {
 PaddingConfiguration,
 Tokenizer,
-TruncationConfiguration
+TruncationConfiguration,
 } from "../../bindings/tokenizer";
 import { BaseTokenizer } from "./base.tokenizer";

 describe("BaseTokenizer", () => {
-let tokenizer: BaseTokenizer<{}>;
+let tokenizer: BaseTokenizer<Record<string, unknown>>;

 beforeEach(() => {
 // Clear all instances and calls to constructor and all methods:
@@ -29,7 +29,7 @@ describe("BaseTokenizer", () => {
 const expectedConfig: TruncationConfiguration = {
 maxLength: 2,
 strategy: TruncationStrategy.LongestFirst,
-stride: 0
+stride: 0,
 };
 expect(tokenizer.truncation).toEqual(expectedConfig);
 });
@@ -52,7 +52,7 @@ describe("BaseTokenizer", () => {
 direction: PaddingDirection.Right,
 padId: 0,
 padToken: "[PAD]",
-padTypeId: 0
+padTypeId: 0,
 };
 expect(tokenizer.padding).toEqual(expectedConfig);
 });
@@ -10,12 +10,13 @@ import {
 PaddingOptions,
 Tokenizer,
 TruncationConfiguration,
-TruncationOptions
+TruncationOptions,
 } from "../../bindings/tokenizer";
 import { Encoding } from "../encoding";

 export type Token = string | AddedToken;

+// eslint-disable-next-line @typescript-eslint/ban-types
 export class BaseTokenizer<TConfig extends object> {
 private _truncation?: TruncationConfiguration;
 private _padding?: PaddingConfiguration;
@@ -114,7 +115,7 @@ export class BaseTokenizer<TConfig extends object> {
 ): Promise<Encoding[]> {
 const encodeBatch = promisify(this.tokenizer.encodeBatch.bind(this.tokenizer));
 const rawEncodings = await encodeBatch(sequences, options);
-return rawEncodings.map(e => new Encoding(e));
+return rawEncodings.map((e) => new Encoding(e));
 }

 /**
@@ -14,7 +14,7 @@ describe("BertWordPieceTokenizer", () => {
 describe("when a vocabFile is provided and `addSpecialTokens === true`", () => {
 it("throws a `sepToken error` if no `sepToken` is provided", async () => {
 const options: BertWordPieceOptions = {
-vocabFile: MOCKS_DIR + "/bert-vocab-empty.txt"
+vocabFile: MOCKS_DIR + "/bert-vocab-empty.txt",
 };

 await expect(BertWordPieceTokenizer.fromOptions(options)).rejects.toThrow(
@@ -24,7 +24,7 @@ describe("BertWordPieceTokenizer", () => {

 it("throws a `clsToken error` if no `clsToken` is provided", async () => {
 const options: BertWordPieceOptions = {
-vocabFile: MOCKS_DIR + "/bert-vocab-without-cls.txt"
+vocabFile: MOCKS_DIR + "/bert-vocab-without-cls.txt",
 };

 await expect(BertWordPieceTokenizer.fromOptions(options)).rejects.toThrow(
@@ -104,7 +104,7 @@ export class BertWordPieceTokenizer extends BaseTokenizer<BertTokenizerConfig> {
 sepToken: "[SEP]",
 stripAccents: true,
 unkToken: "[UNK]",
-wordpiecesPrefix: "##"
+wordpiecesPrefix: "##",
 };

 private readonly defaultTrainOptions: Required<BertWordPieceTrainOptions> = {
@@ -114,7 +114,7 @@ export class BertWordPieceTokenizer extends BaseTokenizer<BertTokenizerConfig> {
 showProgress: true,
 specialTokens: ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
 vocabSize: 30000,
-wordpiecesPrefix: "##"
+wordpiecesPrefix: "##",
 };

 private constructor(tokenizer: Tokenizer, configuration: BertTokenizerConfig) {
@@ -135,7 +135,7 @@ export class BertWordPieceTokenizer extends BaseTokenizer<BertTokenizerConfig> {
 const fromFiles = promisify<string, WordPieceOptions, Model>(WordPiece.fromFiles);
 model = await fromFiles(opts.vocabFile, {
 unkToken: getTokenContent(opts.unkToken),
-continuingSubwordPrefix: opts.wordpiecesPrefix
+continuingSubwordPrefix: opts.wordpiecesPrefix,
 });
 } else {
 model = WordPiece.empty();
@@ -148,7 +148,7 @@ export class BertWordPieceTokenizer extends BaseTokenizer<BertTokenizerConfig> {
 opts.sepToken,
 opts.unkToken,
 opts.padToken,
-opts.maskToken
+opts.maskToken,
 ]) {
 if (tokenizer.tokenToId(getTokenContent(token)) !== undefined) {
 tokenizer.addSpecialTokens([token]);
@@ -5,7 +5,7 @@ import { BPE, BPEOptions, Model } from "../../bindings/models";
 import {
 lowercaseNormalizer,
 nfkcNormalizer,
-sequenceNormalizer
+sequenceNormalizer,
 } from "../../bindings/normalizers";
 import { whitespaceSplitPreTokenizer } from "../../bindings/pre-tokenizers";
 import { Tokenizer } from "../../bindings/tokenizer";
@@ -75,7 +75,7 @@ type BPETokenizerConfig = BPETokenizerOptions &
 export class BPETokenizer extends BaseTokenizer<BPETokenizerConfig> {
 private static readonly defaultBPEOptions: BPETokenizerConfig = {
 suffix: "</w>",
-unkToken: "<unk>"
+unkToken: "<unk>",
 };

 private readonly defaultTrainOptions: Required<BPETokenizerTrainOptions> = {
@@ -85,7 +85,7 @@ export class BPETokenizer extends BaseTokenizer<BPETokenizerConfig> {
 showProgress: true,
 specialTokens: ["<unk>"],
 suffix: "</w>",
-vocabSize: 30000
+vocabSize: 30000,
 };

 private constructor(tokenizer: Tokenizer, configuration: BPETokenizerConfig) {
@@ -105,7 +105,7 @@ export class BPETokenizer extends BaseTokenizer<BPETokenizerConfig> {
 const modelOptions: BPEOptions = {
 dropout: opts.dropout,
 endOfWordSuffix: opts.suffix,
-unkToken: unkToken
+unkToken: unkToken,
 };

 const fromFiles = promisify<string, string, BPEOptions, Model>(BPE.fromFiles);
@@ -5,7 +5,7 @@ import { BPE, BPEOptions, Model } from "../../bindings/models";
 import {
 lowercaseNormalizer,
 nfkcNormalizer,
-sequenceNormalizer
+sequenceNormalizer,
 } from "../../bindings/normalizers";
 import { byteLevelProcessing } from "../../bindings/post-processors";
 import { byteLevelAlphabet, byteLevelPreTokenizer } from "../../bindings/pre-tokenizers";
@@ -72,14 +72,14 @@ type ByteLevelBPETokenizerConfig = ByteLevelBPETokenizerOptions &
 export class ByteLevelBPETokenizer extends BaseTokenizer<ByteLevelBPETokenizerConfig> {
 private static readonly defaultOptions: ByteLevelBPETokenizerConfig = {
 addPrefixSpace: false,
-trimOffsets: false
+trimOffsets: false,
 };

 private readonly defaultTrainOptions: Required<ByteLevelBPETrainOptions> = {
 minFrequency: 2,
 showProgress: true,
 specialTokens: ["<unk>"],
-vocabSize: 30000
+vocabSize: 30000,
 };

 private constructor(tokenizer: Tokenizer, configuration: ByteLevelBPETokenizerConfig) {
@@ -127,7 +127,7 @@ export class ByteLevelBPETokenizer extends BaseTokenizer<ByteLevelBPETokenizerCo
 const mergedOptions = { ...this.defaultTrainOptions, ...options };
 const trainer = bpeTrainer({
 ...mergedOptions,
-initialAlphabet: byteLevelAlphabet()
+initialAlphabet: byteLevelAlphabet(),
 });

 this.tokenizer.train(trainer, files);
@@ -68,7 +68,7 @@ export class SentencePieceBPETokenizer extends BaseTokenizer<
 private static readonly defaultOptions: SentencePieceBPETokenizerConfig = {
 addPrefixSpace: true,
 replacement: "▁",
-unkToken: "<unk>"
+unkToken: "<unk>",
 };

 private readonly defaultTrainOptions: Required<SentencePieceBPETrainOptions> = {
@@ -77,7 +77,7 @@ export class SentencePieceBPETokenizer extends BaseTokenizer<
 minFrequency: 2,
 showProgress: true,
 specialTokens: ["<unk>"],
-vocabSize: 30000
+vocabSize: 30000,
 };

 private constructor(
@@ -97,7 +97,7 @@ export class SentencePieceBPETokenizer extends BaseTokenizer<
 if (opts.vocabFile && opts.mergesFile) {
 const modelOptions: BPEOptions = {
 dropout: opts.dropout,
-unkToken: unkToken
+unkToken: unkToken,
 };

 const fromFiles = promisify<string, string, BPEOptions, Model>(BPE.fromFiles);
@@ -11,6 +11,6 @@ export {
 EncodeInput,
 EncodeOptions,
 TruncationConfiguration,
-TruncationOptions
+TruncationOptions,
 } from "./bindings/tokenizer";
 export { Encoding } from "./implementations/encoding";