mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
* Move to maturing mimicking move for `safetensors`. * Tmp. * Fix sdist. * Wat? * Clippy 1.72 * Remove if. * Conda sed. * Fix doc check workflow. * Moving to maturin AND removing http + openssl mess (smoothing transition moving to `huggingface_hub`) * Fix dep * Black. * New node bindings. * Fix docs + node cache ? * Yarn. * Working dir. * Extension module. * Put back interpreter. * Remove cache. * New attempt * Multi python. * Remove FromPretrained. * Remove traces of `fromPretrained`. * Drop 3.12 for windows? * Typo. * Put back the default feature for ignoring links during simple test. * Fix ? * x86_64 -> x64. * Remove warning for windows bindings. * Excluse aarch. * Include/exclude. * Put back workflows in correct states.
163 lines
5.0 KiB
TypeScript
163 lines
5.0 KiB
TypeScript
// import { promisify } from 'util'
|
|
|
|
import { BPE, Tokenizer, mergeEncodings, slice } from '../../'
|
|
|
|
/**
 * Specs for the `slice` helper exported by the bindings.
 *
 * Every case slices the same fixture sentence. The sentence ends with an
 * emoji, and the expected values below count that emoji as a single position
 * (e.g. `-1` yields just the emoji, and `-7` lands right after "is").
 */
describe('slice', () => {
  const sentence = 'My name is John 👋'
  // Pre-bind the fixture as the first argument so each case only passes
  // the `begin` / `end` indexes it is interested in.
  const sliceSentence = slice.bind({}, sentence)

  it('returns the full text when no params', () => {
    expect(sliceSentence()).toEqual(sentence)
  })

  it('accepts `undefined` as second parameter', () => {
    expect(sliceSentence(undefined)).toEqual(sentence)
  })

  it('accepts `undefined` as third parameter', () => {
    expect(sliceSentence(0, undefined)).toEqual(sentence)
  })

  it('throws an error when `begin` is out of range', () => {
    expect(() => sliceSentence(1000)).toThrow()
  })

  it('returns slice starting at the specified index', () => {
    expect(sliceSentence(3)).toEqual('name is John 👋')
  })

  it('throws an error when `end` is out of range', () => {
    expect(() => sliceSentence(0, 1000)).toThrow()
  })

  it('returns the text between the two specified indexes', () => {
    expect(sliceSentence(3, 7)).toEqual('name')
  })

  // Negative indexes are expected to count back from the end of the text.
  describe('with only a negative `begin`', () => {
    it('returns the original string counting from the end when in the range', () => {
      expect(sliceSentence(-1)).toEqual('👋')
    })

    it('throws an error when out of range', () => {
      expect(() => sliceSentence(-1000)).toThrow()
    })
  })

  describe('with a positive `begin` and a negative `end`', () => {
    it('returns correct slice when resulting range is valid', () => {
      expect(sliceSentence(3, -7)).toEqual('name is')
    })

    it('throws an error when resulting `end` index is lower than `begin`', () => {
      expect(() => sliceSentence(7, -12)).toThrow()
    })

    it('throws an error when `begin` is out of range', () => {
      expect(() => sliceSentence(1000, -12)).toThrow()
    })

    it('throws an error when resulting `end` index is out of range', () => {
      expect(() => sliceSentence(7, -1000)).toThrow()
    })
  })

  describe('with a negative `begin` and a positive `end`', () => {
    it('returns correct slice when resulting range is valid', () => {
      expect(sliceSentence(-9, 10)).toEqual('is')
    })

    it('throws an error when resulting `begin` index is upper than `end`', () => {
      expect(() => sliceSentence(-3, 5)).toThrow()
    })

    it('throws an error when `end` is out of range', () => {
      expect(() => sliceSentence(-5, 1000)).toThrow()
    })

    it('throws an error when resulting `begin` index is out of range', () => {
      expect(() => sliceSentence(-1000, 10)).toThrow()
    })
  })

  describe('with negatives `begin` and `end`', () => {
    it('returns correct slice when resulting range is valid', () => {
      expect(sliceSentence(-9, -7)).toEqual('is')
    })

    it('throws an error when resulting `end` index is lower than `begin`', () => {
      expect(() => sliceSentence(-5, -10)).toThrow()
    })

    it('throws an error when resulting `begin` index is out of range', () => {
      expect(() => sliceSentence(-1000, -10)).toThrow()
    })

    it('throws an error when resulting `end` index is out of range', () => {
      expect(() => sliceSentence(-10, -1000)).toThrow()
    })
  })
})
|
|
|
|
/**
 * Specs for `mergeEncodings`.
 *
 * A tokenizer built on an empty BPE model is given four added tokens, then
 * two inputs ('my name is' and 'john') are encoded and merged. The cases
 * differ only in the `growingOffsets` argument and the offsets expected for
 * the token coming from the second input.
 */
describe('mergeEncodings', () => {
  const tokenizer = new Tokenizer(BPE.empty())
  tokenizer.addTokens(['my', 'name', 'is', 'john'])

  // Token sequence expected from every merge of the two sample encodings.
  const expectedTokens = ['my', 'name', 'is', 'john']

  // Encodes the two sample inputs one after the other and returns them in
  // the order they should be merged.
  const encodeSamples = async () => {
    const head = await tokenizer.encode('my name is', null)
    const tail = await tokenizer.encode('john', null)
    return [head, tail]
  }

  it('accepts `undefined` as a second parameter', () => {
    const merged = mergeEncodings([], undefined)
    expect(merged.constructor.name).toEqual('Encoding')
  })

  it('returns correct result with `growingOffsets` not provided', async () => {
    const merged = mergeEncodings(await encodeSamples())

    expect(merged.getTokens()).toEqual(expectedTokens)
    // The last offset pair is still relative to the second input's own text.
    expect(merged.getOffsets()).toEqual([
      [0, 2],
      [3, 7],
      [8, 10],
      [0, 4],
    ])
  })

  it('returns correct result when `growingOffsets` is `false`', async () => {
    const merged = mergeEncodings(await encodeSamples(), false)

    expect(merged.getTokens()).toEqual(expectedTokens)
    // Same expectation as when the argument is omitted.
    expect(merged.getOffsets()).toEqual([
      [0, 2],
      [3, 7],
      [8, 10],
      [0, 4],
    ])
  })

  it('returns correct result when `growingOffsets` is `true`', async () => {
    const merged = mergeEncodings(await encodeSamples(), true)

    expect(merged.getTokens()).toEqual(expectedTokens)
    // The last offset pair continues from where the first input ended (10).
    expect(merged.getOffsets()).toEqual([
      [0, 2],
      [3, 7],
      [8, 10],
      [10, 14],
    ])
  })
})
|