mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Add tests for from_pretrained
bindings/node/lib/bindings/tokenizer.d.ts (vendored, 16 lines changed)
@@ -7,6 +7,19 @@ import { PreTokenizer } from "./pre-tokenizers";
 import { RawEncoding } from "./raw-encoding";
 import { Trainer } from "./trainers";
 
+export interface FromPretrainedOptions {
+  /**
+   * The revision to download
+   * @default "main"
+   */
+  revision?: string;
+  /**
+   * The auth token to use to access private repositories on the Hugging Face Hub
+   * @default undefined
+   */
+  authToken?: string;
+}
+
 export interface TruncationOptions {
   /**
    * The length of the previous sequence to be included in the overflowing sequence
@@ -128,8 +141,9 @@ export class Tokenizer {
    * Hugging Face Hub. Any model repo containing a `tokenizer.json`
    * can be used here.
    * @param identifier A model identifier on the Hub
+   * @param options Additional options
    */
-  static fromPretrained(s: string): Tokenizer;
+  static fromPretrained(s: string, options?: FromPretrainedOptions): Tokenizer;
 
   /**
    * Add the given tokens to the vocabulary
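For orientation, a minimal usage sketch of the new overload (not part of the commit). The import path and the `authToken` value are placeholders; the model identifiers and the `revision` value are the ones exercised by the tests below.

import { Tokenizer } from "./bindings/tokenizer"; // path is illustrative

// Public model; the defaults apply (revision "main", no auth token).
const bert = Tokenizer.fromPretrained("bert-base-cased");

// Pin a specific revision on the Hub; authToken is only needed for private repos.
const gpt2Style = Tokenizer.fromPretrained("anthony/tokenizers-test", {
  revision: "gpt-2",
  authToken: "hf_xxx", // placeholder, not a real credential
});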
@@ -95,6 +95,33 @@ describe("Tokenizer", () => {
     expect(typeof tokenizer.train).toBe("function");
   });
 
+  it("can be instantiated from the hub", async () => {
+    let tokenizer: Tokenizer;
+    let encode: (
+      sequence: InputSequence,
+      pair?: InputSequence | null,
+      options?: EncodeOptions | null
+    ) => Promise<RawEncoding>;
+    let output: RawEncoding;
+
+    tokenizer = Tokenizer.fromPretrained("bert-base-cased");
+    encode = promisify(tokenizer.encode.bind(tokenizer));
+    output = await encode("Hey there dear friend!", null, { addSpecialTokens: false });
+    expect(output.getTokens()).toEqual(["Hey", "there", "dear", "friend", "!"]);
+
+    tokenizer = Tokenizer.fromPretrained("anthony/tokenizers-test");
+    encode = promisify(tokenizer.encode.bind(tokenizer));
+    output = await encode("Hey there dear friend!", null, { addSpecialTokens: false });
+    expect(output.getTokens()).toEqual(["hey", "there", "dear", "friend", "!"]);
+
+    tokenizer = Tokenizer.fromPretrained("anthony/tokenizers-test", {
+      revision: "gpt-2",
+    });
+    encode = promisify(tokenizer.encode.bind(tokenizer));
+    output = await encode("Hey there dear friend!", null, { addSpecialTokens: false });
+    expect(output.getTokens()).toEqual(["Hey", "Ġthere", "Ġdear", "Ġfriend", "!"]);
+  });
+
   describe("addTokens", () => {
     it("accepts a list of string as new tokens when initial model is empty", () => {
       const model = BPE.empty();
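A note on the pattern the test relies on: the native `encode` method is callback-style, so the test wraps it with Node's `util.promisify`, and `.bind(tokenizer)` keeps `this` pointing at the tokenizer once the method is detached. A standalone sketch of that wrapper, assuming the type names are exported as the test imports them (paths and the helper name are illustrative):

import { promisify } from "util";
import { Tokenizer, InputSequence, EncodeOptions } from "./bindings/tokenizer";
import { RawEncoding } from "./bindings/raw-encoding";

// Turn the callback-style native `encode` into a promise-returning function.
// `bind` is required because promisify detaches the method from its instance.
function makeAsyncEncode(tokenizer: Tokenizer) {
  return promisify(tokenizer.encode.bind(tokenizer)) as (
    sequence: InputSequence,
    pair?: InputSequence | null,
    options?: EncodeOptions | null
  ) => Promise<RawEncoding>;
}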
@@ -392,3 +392,17 @@ class TestTokenizer:
         tokenizer = Tokenizer(BPE())
         multiprocessing_with_parallelism(tokenizer, False)
         multiprocessing_with_parallelism(tokenizer, True)
+
+    def test_from_pretrained(self):
+        tokenizer = Tokenizer.from_pretrained("bert-base-cased")
+        output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
+        assert output.tokens == ["Hey", "there", "dear", "friend", "!"]
+
+    def test_from_pretrained_revision(self):
+        tokenizer = Tokenizer.from_pretrained("anthony/tokenizers-test")
+        output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
+        assert output.tokens == ["hey", "there", "dear", "friend", "!"]
+
+        tokenizer = Tokenizer.from_pretrained("anthony/tokenizers-test", revision="gpt-2")
+        output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
+        assert output.tokens == ["Hey", "Ġthere", "Ġdear", "Ġfriend", "!"]
tokenizers/tests/from_pretrained.rs (new file, 37 lines)
@@ -0,0 +1,37 @@
+use tokenizers::{FromPretrainedParameters, Result, Tokenizer};
+
+#[test]
+fn test_from_pretrained() -> Result<()> {
+    let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None)?;
+    let encoding = tokenizer.encode("Hey there dear friend!", false)?;
+    assert_eq!(
+        encoding.get_tokens(),
+        &["Hey", "there", "dear", "friend", "!"]
+    );
+    Ok(())
+}
+
+#[test]
+fn test_from_pretrained_revision() -> Result<()> {
+    let tokenizer = Tokenizer::from_pretrained("anthony/tokenizers-test", None)?;
+    let encoding = tokenizer.encode("Hey there dear friend!", false)?;
+    assert_eq!(
+        encoding.get_tokens(),
+        &["hey", "there", "dear", "friend", "!"]
+    );
+
+    let tokenizer = Tokenizer::from_pretrained(
+        "anthony/tokenizers-test",
+        Some(FromPretrainedParameters {
+            revision: "gpt-2".to_string(),
+            ..Default::default()
+        }),
+    )?;
+    let encoding = tokenizer.encode("Hey there dear friend!", false)?;
+    assert_eq!(
+        encoding.get_tokens(),
+        &["Hey", "Ġthere", "Ġdear", "Ġfriend", "!"]
+    );
+
+    Ok(())
+}