Merge pull request #92 from huggingface/fix-bpe-tokenizer
Fix BPETokenizer
@@ -29,6 +29,13 @@ export function byteLevelAlphabet(): string[];
  */
 export function whitespacePreTokenizer(): PreTokenizer;
 
+/**
+ * Returns a WhitespaceSplit PreTokenizer
+ * This pre-tokenizer simply splits on whitespaces only. Works almost like the `.split(' ')`
+ * function, except that it accounts for multiple consecutive spaces
+ */
+export function whitespaceSplitPreTokenizer(): PreTokenizer;
+
 /**
  * Returns a new Bert PreTokenizer.
  * This pre-tokenizer splits tokens on spaces, and also on punctuation.

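The doc comment above spells out what the new pre-tokenizer does: it splits on whitespace only, treats a run of consecutive spaces as a single separator, and leaves punctuation attached to the neighbouring word (the Rust tests added at the end of this diff confirm that). A minimal, standalone Python illustration of that splitting behaviour, not the library's own code:

text = "Hey   man!  Good?"

# Splitting on a single space keeps empty strings for runs of spaces:
print(text.split(' '))   # ['Hey', '', '', 'man!', '', 'Good?']

# Whitespace-only splitting as described above (like .split() with no argument):
print(text.split())      # ['Hey', 'man!', 'Good?']
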
@@ -1,9 +1,10 @@
 var native = require('./native');
 
 module.exports = {
   byteLevelPreTokenizer: native.pre_tokenizers_ByteLevel,
   byteLevelAlphabet: native.pre_tokenizers_ByteLevel_Alphabet,
   whitespacePreTokenizer: native.pre_tokenizers_Whitespace,
+  whitespaceSplitPreTokenizer: native.pre_tokenizers_WhitespaceSplit,
   bertPreTokenizer: native.pre_tokenizers_BertPreTokenizer,
   metaspacePreTokenizer: native.pre_tokenizers_Metaspace
 };

@ -3,7 +3,7 @@ import { BaseTokenizer } from "./base.tokenizer";
|
|||||||
import { Model, bpe } from "../bindings/models";
|
import { Model, bpe } from "../bindings/models";
|
||||||
import { Tokenizer } from "../bindings/tokenizer";
|
import { Tokenizer } from "../bindings/tokenizer";
|
||||||
import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
|
import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
|
||||||
import { whitespacePreTokenizer } from "../bindings/pre-tokenizers";
|
import { whitespaceSplitPreTokenizer } from "../bindings/pre-tokenizers";
|
||||||
import { bpeDecoder } from "../bindings/decoders";
|
import { bpeDecoder } from "../bindings/decoders";
|
||||||
import { bpeTrainer } from "../bindings/trainers";
|
import { bpeTrainer } from "../bindings/trainers";
|
||||||
|
|
||||||
@@ -103,7 +103,7 @@ export class BPETokenizer extends BaseTokenizer {
 
     const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);
     tokenizer.setNormalizer(normalizer);
-    tokenizer.setPreTokenizer(whitespacePreTokenizer());
+    tokenizer.setPreTokenizer(whitespaceSplitPreTokenizer());
 
     const decoder = bpeDecoder(mergedOptions.suffix);
     tokenizer.setDecoder(decoder);

@@ -62,6 +62,17 @@ fn whitespace(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
     Ok(pretok)
 }
 
+/// whitespace_split()
+fn whitespace_split(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
+    let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
+    let guard = cx.lock();
+    pretok
+        .borrow_mut(&guard)
+        .pretok
+        .to_owned(Box::new(tk::pre_tokenizers::whitespace::WhitespaceSplit));
+    Ok(pretok)
+}
+
 /// bert_pre_tokenizer()
 fn bert_pre_tokenizer(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
     let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;

@@ -105,6 +116,7 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
         byte_level_alphabet,
     )?;
     m.export_function(&format!("{}_Whitespace", prefix), whitespace)?;
+    m.export_function(&format!("{}_WhitespaceSplit", prefix), whitespace_split)?;
     m.export_function(&format!("{}_BertPreTokenizer", prefix), bert_pre_tokenizer)?;
     m.export_function(&format!("{}_Metaspace", prefix), metaspace)?;
     Ok(())

@@ -37,6 +37,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<pre_tokenizers::PreTokenizer>()?;
     m.add_class::<pre_tokenizers::ByteLevel>()?;
     m.add_class::<pre_tokenizers::Whitespace>()?;
+    m.add_class::<pre_tokenizers::WhitespaceSplit>()?;
     m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
     m.add_class::<pre_tokenizers::Metaspace>()?;
     Ok(())

@@ -73,6 +73,18 @@ impl Whitespace {
     }
 }
 
+#[pyclass]
+pub struct WhitespaceSplit {}
+#[pymethods]
+impl WhitespaceSplit {
+    #[staticmethod]
+    fn new() -> PyResult<PreTokenizer> {
+        Ok(PreTokenizer {
+            pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::WhitespaceSplit)),
+        })
+    }
+}
+
 #[pyclass]
 pub struct BertPreTokenizer {}
 #[pymethods]

@@ -30,7 +30,7 @@ class BPETokenizer(BaseTokenizer):
             NFKC.new(),
             Lowercase.new()
         ])
-        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
+        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit.new()
         tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)
 
         parameters = {

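On the Python side the fix amounts to the single reassignment above. A minimal sketch of the equivalent wiring, assuming the Python bindings from this era of the repository (with their `.new()` constructors) are installed; the surrounding Tokenizer and model setup is elided:

from tokenizers import pre_tokenizers

# Previously the BPETokenizer used pre_tokenizers.Whitespace.new()
pre_tok = pre_tokenizers.WhitespaceSplit.new()
# tokenizer.pre_tokenizer = pre_tok
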
@@ -3,5 +3,6 @@ from .. import pre_tokenizers
 PreTokenizer = pre_tokenizers.PreTokenizer
 ByteLevel = pre_tokenizers.ByteLevel
 Whitespace = pre_tokenizers.Whitespace
+WhitespaceSplit = pre_tokenizers.WhitespaceSplit
 BertPreTokenizer = pre_tokenizers.BertPreTokenizer
 Metaspace = pre_tokenizers.Metaspace

@@ -55,6 +55,17 @@ class Whitespace:
         """ Instantiate a new Whitespace PreTokenizer """
         pass
 
+class WhitespaceSplit:
+    """ Whitespace PreTokenizer
+
+    This pre-tokenizer simply splits on the whitespace. Works like `.split()`
+    """
+
+    @staticmethod
+    def new() -> PreTokenizer:
+        """ Instantiate a new WhitespaceSplit PreTokenizer """
+        pass
+
 class BertPreTokenizer:
     """ BertPreTokenizer
 

@@ -26,9 +26,36 @@ impl PreTokenizer for Whitespace {
     }
 }
 
+pub struct WhitespaceSplit;
+impl PreTokenizer for WhitespaceSplit {
+    fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
+        let mut words = vec![];
+        let mut word = Vec::with_capacity(1000);
+        let mut offset = 0;
+
+        s.chars().for_each(|c| {
+            if c.is_whitespace() {
+                if !word.is_empty() {
+                    let offsets = (offset - word.len(), offset);
+                    words.push((word.drain(0..).collect::<String>(), offsets));
+                }
+            } else {
+                word.push(c);
+            }
+            offset += 1;
+        });
+        if !word.is_empty() {
+            let offsets = (offset - word.len(), offset);
+            words.push((word.drain(0..).collect::<String>(), offsets));
+        }
+
+        Ok(words)
+    }
+}
+
 #[cfg(test)]
 mod tests {
-    use super::Whitespace;
+    use super::*;
     use crate::tokenizer::PreTokenizer;
 
     #[test]

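The `pre_tokenize` implementation above walks the input character by character, buffers non-whitespace characters into the current word, and on each whitespace boundary emits the word together with end-exclusive character offsets `(offset - word.len(), offset)`. A rough Python rendering of the same logic, a sketch for checking the offsets by hand rather than the library's code:

def whitespace_split(s: str):
    # Mirrors the Rust WhitespaceSplit above: character offsets, end-exclusive.
    words, word, offset = [], [], 0
    for c in s:
        if c.isspace():
            if word:
                words.append(("".join(word), (offset - len(word), offset)))
                word = []
        else:
            word.append(c)
        offset += 1
    if word:  # flush the trailing word
        words.append(("".join(word), (offset - len(word), offset)))
    return words

print(whitespace_split("Hey man!"))         # [('Hey', (0, 3)), ('man!', (4, 8))]
print(whitespace_split("Hey, man, Good?"))  # [('Hey,', (0, 4)), ('man,', (5, 9)), ('Good?', (10, 15))]

The expected pairs match the test vectors added in the next hunk.
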
@@ -58,4 +85,26 @@ mod tests {
             assert_eq!(pretok.pre_tokenize(s).unwrap(), res);
         }
     }
+
+    #[test]
+    fn whitespace_split() {
+        let tests = vec![
+            (
+                "Hey man!",
+                vec![("Hey".into(), (0, 3)), ("man!".into(), (4, 8))],
+            ),
+            (
+                "Hey, man, Good?",
+                vec![
+                    ("Hey,".into(), (0, 4)),
+                    ("man,".into(), (5, 9)),
+                    ("Good?".into(), (10, 15)),
+                ],
+            ),
+        ];
+        let pretok = WhitespaceSplit;
+        for (s, res) in tests {
+            assert_eq!(pretok.pre_tokenize(s).unwrap(), res);
+        }
+    }
 }