Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
@@ -2,7 +2,7 @@
 authors = ["Nicolas Patry <nicolas@huggingface.co>"]
 edition = "2021"
 name = "node"
-version = "0.14.1-dev.0"
+version = "0.14.2-dev.0"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
bindings/node/index.d.ts (vendored)
@@ -175,7 +175,7 @@ export class Encoding {
   getSequenceIds(): Array<number | undefined | null>
   tokenToSequence(token: number): number | null
 }
-export class Model { }
+export class Model {}
 export type Bpe = BPE
 export class BPE {
   static empty(): Model
@@ -204,7 +204,7 @@ export class Normalizer {
 export class PreTokenizer {
   preTokenizeString(sequence: string): [string, [number, number]][]
 }
-export class Processor { }
+export class Processor {}
 export class AddedToken {
   constructor(token: string, isSpecial: boolean, options?: AddedTokenOptions | undefined | null)
   getContent(): string
@@ -229,7 +229,6 @@ export class Tokenizer {
   decodeBatch(ids: Array<Array<number>>, skipSpecialTokens: boolean): Promise<string[]>
   static fromString(s: string): Tokenizer
   static fromFile(file: string): Tokenizer
-  // static fromPretrained(file: string, parameters?: JsFromPretrainedParameters | undefined | null): Tokenizer
   addSpecialTokens(tokens: Array<string>): void
   setTruncation(maxLength: number, options?: TruncationOptions | undefined | null): void
   disableTruncation(): void
@@ -251,4 +250,4 @@ export class Tokenizer {
     addSpecialTokens?: boolean | undefined | null,
   ): Encoding
 }
-export class Trainer { }
+export class Trainer {}
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.14.1-dev.0"
+version = "0.14.2-dev.0"
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2021"
 
@@ -21,7 +21,7 @@ onig = { version = "6.4", default-features = false }
 itertools = "0.11"
 
 [dependencies.tokenizers]
-version = "0.14.1-dev.0"
+version = "0.14.2-dev.0"
 path = "../../tokenizers"
 
 [dev-dependencies]
@@ -2,7 +2,7 @@
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>", "Nicolas Patry <patry.nicolas@protonmail.com>"]
 edition = "2018"
 name = "tokenizers"
-version = "0.14.1-dev.0"
+version = "0.14.2-dev.0"
 homepage = "https://github.com/huggingface/tokenizers"
 repository = "https://github.com/huggingface/tokenizers"
 documentation = "https://docs.rs/tokenizers/"
@@ -21,17 +21,17 @@ impl PartialEq for Merge {
 }
 impl PartialOrd for Merge {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        if self.count != other.count {
-            Some(self.count.cmp(&other.count))
-        } else {
-            // Here we want ascending order
-            Some(other.pair.cmp(&self.pair))
-        }
+        Some(self.cmp(other))
     }
 }
 impl Ord for Merge {
     fn cmp(&self, other: &Self) -> Ordering {
-        self.partial_cmp(other).unwrap()
+        if self.count != other.count {
+            self.count.cmp(&other.count)
+        } else {
+            // Here we want ascending order
+            other.pair.cmp(&self.pair)
+        }
     }
 }
 
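The hunk above moves the real comparison into `Ord::cmp` and lets `PartialOrd::partial_cmp` delegate to it, the form clippy recommends for types that implement both traits, so the two impls can never disagree. A minimal sketch of the resulting ordering, using an illustrative `Candidate` struct rather than the crate's actual `Merge` type:

use std::cmp::Ordering;
use std::collections::BinaryHeap;

// Illustrative stand-in for the trainer's merge entry; the real struct also
// tracks the positions where the pair occurs.
#[derive(Debug, Eq, PartialEq)]
struct Candidate {
    count: u64,
    pair: (u32, u32),
}

impl Ord for Candidate {
    fn cmp(&self, other: &Self) -> Ordering {
        if self.count != other.count {
            self.count.cmp(&other.count)
        } else {
            // Ties broken by ascending pair order (comparison reversed
            // because BinaryHeap is a max-heap).
            other.pair.cmp(&self.pair)
        }
    }
}

// Canonical delegation: partial_cmp can no longer drift out of sync with cmp.
impl PartialOrd for Candidate {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(Candidate { count: 3, pair: (5, 6) });
    heap.push(Candidate { count: 7, pair: (1, 2) });
    heap.push(Candidate { count: 7, pair: (0, 9) });
    // Highest count first; equal counts pop in ascending pair order.
    assert_eq!(heap.pop().unwrap().pair, (0, 9));
    assert_eq!(heap.pop().unwrap().pair, (1, 2));
    assert_eq!(heap.pop().unwrap().count, 3);
}

In the trainer's max-heap this pops the most frequent pair first and, on a tie, the lexicographically smallest pair.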
@@ -533,15 +533,16 @@ impl BpeTrainer {
             let changes = top
                 .pos
                 .maybe_par_iter()
-                .flat_map(|i| {
-                    let w = &words[*i] as *const _ as *mut _;
+                .flat_map(|&i| {
+                    let word = &words[i] as *const _ as *mut Word;
                     // We can merge each of these words in parallel here because each position
                     // can be there only once (HashSet). So this is safe.
                     unsafe {
-                        let word: &mut Word = &mut (*w);
-                        word.merge(top.pair.0, top.pair.1, new_token_id, max_token_length)
+                        // let word: &mut Word = &mut (*word);
+                        (*word)
+                            .merge(top.pair.0, top.pair.1, new_token_id, max_token_length)
                             .into_iter()
-                            .map(|c| (c, *i))
+                            .map(|c| (c, i))
                             .collect::<Vec<_>>()
                     }
                 })
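On the safety comment this hunk keeps: the indices come from a `HashSet`, so each word position occurs at most once and the parallel tasks never alias the same `Word`. A hedged sketch of that uniqueness argument expressed in safe Rust, using rayon's `par_iter_mut` directly instead of the crate's `maybe_par_iter` wrapper and raw-pointer cast (names and numbers below are illustrative only, not the crate's code):

use rayon::prelude::*;
use std::collections::HashSet;

fn main() {
    let mut words = vec![1u64, 2, 3, 4, 5, 6, 7, 8];
    // Positions to update; a HashSet guarantees each index appears only once.
    let pos: HashSet<usize> = [1, 3, 5].into_iter().collect();

    // Safe formulation: par_iter_mut hands out disjoint &mut references, so no
    // unsafe cast is needed, at the cost of scanning every word once.
    let changes: Vec<(usize, u64)> = words
        .par_iter_mut()
        .enumerate()
        .filter(|(i, _)| pos.contains(i))
        .map(|(i, w)| {
            *w += 10; // stand-in for word.merge(...)
            (i, *w)
        })
        .collect();

    println!("{changes:?}");
}

The crate keeps the raw-pointer version because it only visits the affected positions; the change in the hunk tightens the cast to a typed `*mut Word` and dereferences it in place instead of re-borrowing through a second binding.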
@@ -20,17 +20,17 @@ impl PartialOrd for Merge {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         // By manually implementing this, we make the containing BinaryHeap a
         // min-heap ordered first on the rank, and the pos otherwise
-        if self.rank != other.rank {
-            Some(other.rank.cmp(&self.rank))
-        } else {
-            Some(other.pos.cmp(&self.pos))
-        }
+        Some(self.cmp(other))
     }
 }
 
 impl Ord for Merge {
     fn cmp(&self, other: &Self) -> Ordering {
-        self.partial_cmp(other).unwrap()
+        if self.rank != other.rank {
+            other.rank.cmp(&self.rank)
+        } else {
+            other.pos.cmp(&self.pos)
+        }
     }
 }
 
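As the retained comment says, reversing both comparisons makes the `BinaryHeap` holding these entries behave as a min-heap: lowest rank first, then lowest position. A small self-contained sketch of that behaviour with an illustrative `QueueEntry` stand-in (not the crate's actual `Merge` struct):

use std::cmp::Ordering;
use std::collections::BinaryHeap;

#[derive(Debug, Eq, PartialEq)]
struct QueueEntry {
    rank: u32,
    pos: usize,
}

impl Ord for QueueEntry {
    fn cmp(&self, other: &Self) -> Ordering {
        // Reversed comparisons: the max-heap then yields the smallest rank,
        // with ties broken by the smallest position.
        if self.rank != other.rank {
            other.rank.cmp(&self.rank)
        } else {
            other.pos.cmp(&self.pos)
        }
    }
}

impl PartialOrd for QueueEntry {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(QueueEntry { rank: 2, pos: 0 });
    heap.push(QueueEntry { rank: 1, pos: 7 });
    heap.push(QueueEntry { rank: 1, pos: 3 });
    // Lowest rank pops first; equal ranks pop in ascending position order.
    assert_eq!(heap.pop().map(|e| (e.rank, e.pos)), Some((1, 3)));
    assert_eq!(heap.pop().map(|e| (e.rank, e.pos)), Some((1, 7)));
    assert_eq!(heap.pop().map(|e| (e.rank, e.pos)), Some((2, 0)));
}

As in the trainer hunk above, `partial_cmp` now simply delegates to `cmp`.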
@@ -25,7 +25,7 @@ impl<Label: Eq + Hash + Copy> Trie<Label> {
     pub fn push(&mut self, element: &[Label]) {
         let mut node = &mut self.root;
         for label in element.iter() {
-            node = node.children.entry(*label).or_insert_with(Node::default);
+            node = node.children.entry(*label).or_default();
         }
         node.is_leaf = true;
     }
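`Entry::or_default()` is shorthand for `or_insert_with(Default::default)`, so the behaviour of `push` is unchanged; the call is just shorter. A minimal trie sketch mirroring the shape implied by the hunk (keyed by `char` for simplicity; the crate's `Trie` is generic over `Label`):

use std::collections::HashMap;

#[derive(Default)]
struct Node {
    children: HashMap<char, Node>,
    is_leaf: bool,
}

#[derive(Default)]
struct Trie {
    root: Node,
}

impl Trie {
    fn push(&mut self, element: &[char]) {
        let mut node = &mut self.root;
        for label in element.iter() {
            // or_default() inserts Node::default() when the label is missing,
            // exactly like the or_insert_with(Node::default) it replaces.
            node = node.children.entry(*label).or_default();
        }
        node.is_leaf = true;
    }
}

fn main() {
    let mut trie = Trie::default();
    trie.push(&['a', 'b']);
    assert!(trie.root.children[&'a'].children[&'b'].is_leaf);
}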