/* tslint:disable */
/* eslint-disable */

/* auto-generated by NAPI-RS */
export function bpeDecoder(suffix?: string | undefined | null): Decoder

export function byteFallbackDecoder(): Decoder

/** ctc_decoder(padToken: string = '<pad>', wordDelimiterToken?, cleanup?) */
export function ctcDecoder(
  padToken?: string | undefined | null,
  wordDelimiterToken?: string | undefined | null,
  cleanup?: boolean | undefined | null,
): Decoder

export function fuseDecoder(): Decoder

/** metaspace(replacement: string = '▁', addPrefixSpace: bool = true) */
export function metaspaceDecoder(replacement?: string | undefined | null, addPrefixSpace?: boolean | undefined | null): Decoder

export function replaceDecoder(pattern: string, content: string): Decoder

export function sequenceDecoder(decoders: Array<Decoder>): Decoder

export function stripDecoder(content: string, left: number, right: number): Decoder

/** word_piece(prefix: string = '##', cleanup: bool = true) */
export function wordPieceDecoder(prefix?: string | undefined | null, cleanup?: boolean | undefined | null): Decoder
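
/*
 * Example (editor's illustration, not part of the generated bindings):
 * decoding WordPiece tokens back into a string, assuming the compiled
 * addon is imported as `tokenizers`:
 *
 *   import * as tokenizers from 'tokenizers'
 *
 *   const decoder = tokenizers.wordPieceDecoder('##', true)
 *   decoder.decode(['un', '##related'])  // => 'unrelated'
 */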

export const enum TruncationDirection {
  Left = 'Left',
  Right = 'Right',
}

export const enum TruncationStrategy {
  LongestFirst = 'LongestFirst',
  OnlyFirst = 'OnlyFirst',
  OnlySecond = 'OnlySecond',
}

export interface BpeOptions {
  cacheCapacity?: number
  dropout?: number
  unkToken?: string
  continuingSubwordPrefix?: string
  endOfWordSuffix?: string
  fuseUnk?: boolean
  byteFallback?: boolean
}

export interface WordPieceOptions {
  unkToken?: string
  continuingSubwordPrefix?: string
  maxInputCharsPerWord?: number
}

export interface WordLevelOptions {
  unkToken?: string
}

export interface UnigramOptions {
  unkId?: number
  byteFallback?: boolean
}

export function prependNormalizer(prepend: string): Normalizer

export function stripAccentsNormalizer(): Normalizer

export interface BertNormalizerOptions {
  cleanText?: boolean
  handleChineseChars?: boolean
  stripAccents?: boolean
  lowercase?: boolean
}

/**
 * bert_normalizer(options?: {
 *   cleanText?: bool = true,
 *   handleChineseChars?: bool = true,
 *   stripAccents?: bool = true,
 *   lowercase?: bool = true
 * })
 */
export function bertNormalizer(options?: BertNormalizerOptions | undefined | null): Normalizer

export function nfdNormalizer(): Normalizer

export function nfkdNormalizer(): Normalizer

export function nfcNormalizer(): Normalizer

export function nfkcNormalizer(): Normalizer

export function stripNormalizer(left?: boolean | undefined | null, right?: boolean | undefined | null): Normalizer

export function sequenceNormalizer(normalizers: Array<Normalizer>): Normalizer

export function lowercase(): Normalizer

export function replace(pattern: string, content: string): Normalizer

export function nmt(): Normalizer

export function precompiled(bytes: Array<number>): Normalizer
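
/*
 * Example (editor's illustration, not part of the generated bindings):
 * composing several normalizers into one and applying it to a string:
 *
 *   const normalizer = tokenizers.sequenceNormalizer([
 *     tokenizers.nfdNormalizer(),
 *     tokenizers.stripAccentsNormalizer(),
 *     tokenizers.lowercase(),
 *   ])
 *   normalizer.normalizeString('Héllo Wörld')  // => 'hello world'
 */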

export const enum JsSplitDelimiterBehavior {
  Removed = 'Removed',
  Isolated = 'Isolated',
  MergedWithPrevious = 'MergedWithPrevious',
  MergedWithNext = 'MergedWithNext',
  Contiguous = 'Contiguous',
}

/** byte_level(addPrefixSpace: bool = true, useRegex: bool = true) */
export function byteLevelPreTokenizer(
  addPrefixSpace?: boolean | undefined | null,
  useRegex?: boolean | undefined | null,
): PreTokenizer

export function byteLevelAlphabet(): Array<string>

export function whitespacePreTokenizer(): PreTokenizer

export function whitespaceSplitPreTokenizer(): PreTokenizer

export function bertPreTokenizer(): PreTokenizer

/** metaspace(replacement: string = '▁', addPrefixSpace: bool = true) */
export function metaspacePreTokenizer(replacement?: string | undefined | null, addPrefixSpace?: boolean | undefined | null): PreTokenizer

export function splitPreTokenizer(pattern: string, behavior: string, invert?: boolean | undefined | null): PreTokenizer

export function punctuationPreTokenizer(behavior?: string | undefined | null): PreTokenizer

export function sequencePreTokenizer(preTokenizers: Array<PreTokenizer>): PreTokenizer

export function charDelimiterSplit(delimiter: string): PreTokenizer

export function digitsPreTokenizer(individualDigits?: boolean | undefined | null): PreTokenizer
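
/*
 * Example (editor's illustration, not part of the generated bindings):
 * inspecting how a pre-tokenizer splits a string into (piece, offsets) pairs:
 *
 *   const preTok = tokenizers.whitespacePreTokenizer()
 *   preTok.preTokenizeString('Hello there!')
 *   // => [['Hello', [0, 5]], ['there', [6, 11]], ['!', [11, 12]]]
 */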

export function bertProcessing(sep: [string, number], cls: [string, number]): Processor

export function robertaProcessing(
  sep: [string, number],
  cls: [string, number],
  trimOffsets?: boolean | undefined | null,
  addPrefixSpace?: boolean | undefined | null,
): Processor

export function byteLevelProcessing(trimOffsets?: boolean | undefined | null): Processor

export function templateProcessing(
  single: string,
  pair?: string | undefined | null,
  specialTokens?: Array<[string, number]> | undefined | null,
): Processor

export function sequenceProcessing(processors: Array<Processor>): Processor
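
/*
 * Example (editor's illustration, not part of the generated bindings):
 * a BERT-style post-processor built from templates; each [string, number]
 * pair maps a special token to its vocabulary id:
 *
 *   const processor = tokenizers.templateProcessing(
 *     '[CLS] $A [SEP]',
 *     '[CLS] $A [SEP] $B:1 [SEP]:1',
 *     [['[CLS]', 101], ['[SEP]', 102]],
 *   )
 */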

export const enum PaddingDirection {
  Left = 0,
  Right = 1,
}

export interface PaddingOptions {
  maxLength?: number
  direction?: string | PaddingDirection
  padToMultipleOf?: number
  padId?: number
  padTypeId?: number
  padToken?: string
}

export interface EncodeOptions {
  isPretokenized?: boolean
  addSpecialTokens?: boolean
}

export interface TruncationOptions {
  maxLength?: number
  strategy?: TruncationStrategy
  direction?: string | TruncationDirection
  stride?: number
}
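
/*
 * Example (editor's illustration, not part of the generated bindings):
 * configuring truncation and padding on a Tokenizer instance (the class is
 * declared further down in this file):
 *
 *   tokenizer.setTruncation(512, { strategy: TruncationStrategy.LongestFirst, stride: 128 })
 *   tokenizer.setPadding({ padId: 0, padToken: '[PAD]', maxLength: 512 })
 */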

export interface AddedTokenOptions {
  singleWord?: boolean
  leftStrip?: boolean
  rightStrip?: boolean
  normalized?: boolean
}

export interface JsFromPretrainedParameters {
  revision?: string
  authToken?: string
}

export function slice(s: string, beginIndex?: number | undefined | null, endIndex?: number | undefined | null): string

export function mergeEncodings(encodings: Array<Encoding>, growingOffsets?: boolean | undefined | null): Encoding
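
/*
 * Example (editor's illustration, not part of the generated bindings):
 * merging two encodings; with growingOffsets = true the offsets of later
 * encodings are shifted so they keep growing across the merged result:
 *
 *   const merged = tokenizers.mergeEncodings([encA, encB], true)
 */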

/** Decoder */
export class Decoder {
  decode(tokens: Array<string>): string
}

export type JsEncoding = Encoding

export class Encoding {
  constructor()
  getLength(): number
  getNSequences(): number
  getIds(): Array<number>
  getTypeIds(): Array<number>
  getAttentionMask(): Array<number>
  getSpecialTokensMask(): Array<number>
  getTokens(): Array<string>
  getOffsets(): Array<Array<number>>
  getWordIds(): Array<number | undefined | null>
  charToToken(pos: number, seqId?: number | undefined | null): number | null
  charToWord(pos: number, seqId?: number | undefined | null): number | null
  pad(length: number, options?: PaddingOptions | undefined | null): void
  truncate(
    length: number,
    stride?: number | undefined | null,
    direction?: string | TruncationDirection | undefined | null,
  ): void
  wordToTokens(word: number, seqId?: number | undefined | null): [number, number] | null | undefined
  wordToChars(word: number, seqId?: number | undefined | null): [number, number] | null | undefined
  tokenToChars(token: number): [number, [number, number]] | null | undefined
  tokenToWord(token: number): number | null
  getOverflowing(): Array<Encoding>
  getSequenceIds(): Array<number | undefined | null>
  tokenToSequence(token: number): number | null
}
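
/*
 * Example (editor's illustration, not part of the generated bindings):
 * inspecting the result of Tokenizer.encode (see the Tokenizer class below):
 *
 *   const encoding = await tokenizer.encode('Hello there!')
 *   encoding.getTokens()     // the string pieces
 *   encoding.getIds()        // their vocabulary ids
 *   encoding.getOffsets()    // [start, end] character spans per token
 *   encoding.charToToken(0)  // index of the token covering character 0
 */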

export class Model { }

export type Bpe = BPE

export class BPE {
  static empty(): Model
  static init(vocab: Vocab, merges: Merges, options?: BpeOptions | undefined | null): Model
  static fromFile(vocab: string, merges: string, options?: BpeOptions | undefined | null): Promise<Model>
}

export class WordPiece {
  static init(vocab: Vocab, options?: WordPieceOptions | undefined | null): Model
  static empty(): WordPiece
  static fromFile(vocab: string, options?: WordPieceOptions | undefined | null): Promise<Model>
}

export class WordLevel {
  static init(vocab: Vocab, options?: WordLevelOptions | undefined | null): Model
  static empty(): WordLevel
  static fromFile(vocab: string, options?: WordLevelOptions | undefined | null): Promise<Model>
}

export class Unigram {
  static init(vocab: Array<[string, number]>, options?: UnigramOptions | undefined | null): Model
  static empty(): Model
}
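
/*
 * Example (editor's illustration, not part of the generated bindings):
 * loading a BPE model from vocabulary and merges files (hypothetical paths)
 * and wrapping it in a Tokenizer:
 *
 *   const model = await tokenizers.BPE.fromFile('vocab.json', 'merges.txt', {
 *     unkToken: '<unk>',
 *   })
 *   const tokenizer = new tokenizers.Tokenizer(model)
 */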

/** Normalizer */
export class Normalizer {
  normalizeString(sequence: string): string
}

/** PreTokenizers */
export class PreTokenizer {
  preTokenizeString(sequence: string): [string, [number, number]][]
}

export class Processor { }

export class AddedToken {
  constructor(token: string, isSpecial: boolean, options?: AddedTokenOptions | undefined | null)
  getContent(): string
}
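
/*
 * Example (editor's illustration, not part of the generated bindings):
 * registering an extra special token on a tokenizer (see Tokenizer below):
 *
 *   const mask = new tokenizers.AddedToken('[MASK]', true, { leftStrip: true })
 *   tokenizer.addAddedTokens([mask])
 */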

export class Tokenizer {
  constructor(model: Model)
  setPreTokenizer(preTokenizer: PreTokenizer): void
  setDecoder(decoder: Decoder): void
  setModel(model: Model): void
  setPostProcessor(postProcessor: Processor): void
  setNormalizer(normalizer: Normalizer): void
  save(path: string, pretty?: boolean | undefined | null): void
  addAddedTokens(tokens: Array<AddedToken>): number
  addTokens(tokens: Array<string>): number
  encode(
    sentence: InputSequence,
    pair?: InputSequence | null,
    encodeOptions?: EncodeOptions | undefined | null,
  ): Promise<JsEncoding>
  encodeBatch(sentences: EncodeInput[], encodeOptions?: EncodeOptions | undefined | null): Promise<JsEncoding[]>
  decode(ids: Array<number>, skipSpecialTokens: boolean): Promise<string>
  decodeBatch(ids: Array<Array<number>>, skipSpecialTokens: boolean): Promise<string[]>
  static fromString(s: string): Tokenizer
  static fromFile(file: string): Tokenizer
  // static fromPretrained(file: string, parameters?: JsFromPretrainedParameters | undefined | null): Tokenizer
  addSpecialTokens(tokens: Array<string>): void
  setTruncation(maxLength: number, options?: TruncationOptions | undefined | null): void
  disableTruncation(): void
  setPadding(options?: PaddingOptions | undefined | null): void
  disablePadding(): void
  getDecoder(): Decoder | null
  getNormalizer(): Normalizer | null
  getPreTokenizer(): PreTokenizer | null
  getPostProcessor(): Processor | null
  getVocab(withAddedTokens?: boolean | undefined | null): Record<string, number>
  getVocabSize(withAddedTokens?: boolean | undefined | null): number
  idToToken(id: number): string | null
  tokenToId(token: string): number | null
  train(files: Array<string>): void
  runningTasks(): number
  postProcess(
    encoding: Encoding,
    pair?: Encoding | undefined | null,
    addSpecialTokens?: boolean | undefined | null,
  ): Encoding
}

export class Trainer { }
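
/*
 * End-to-end example (editor's illustration, not part of the generated
 * bindings): loading a serialized tokenizer and running a round trip,
 * assuming a `tokenizer.json` file is available:
 *
 *   import { Tokenizer } from 'tokenizers'
 *
 *   const tokenizer = Tokenizer.fromFile('tokenizer.json')
 *   const encoding = await tokenizer.encode('Hello there!')
 *   const text = await tokenizer.decode(encoding.getIds(), true)
 */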