Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Merge pull request #92 from huggingface/fix-bpe-tokenizer
Fix BPETokenizer
@@ -29,6 +29,13 @@ export function byteLevelAlphabet(): string[];
  */
 export function whitespacePreTokenizer(): PreTokenizer;
 
+/**
+ * Returns a WhitespaceSplit PreTokenizer
+ * This pre-tokenizer simply splits on whitespace only. It works almost like the `.split(' ')`
+ * function, except that it accounts for multiple consecutive spaces.
+ */
+export function whitespaceSplitPreTokenizer(): PreTokenizer;
+
 /**
  * Returns a new Bert PreTokenizer.
  * This pre-tokenizer splits tokens on spaces, and also on punctuation.
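
A quick way to see the behaviour these two declarations describe is a tiny, self-contained Python sketch (plain standard library, not the bindings themselves). The regex used for the existing Whitespace pre-tokenizer is an assumption about its splitting rule; the WhitespaceSplit side follows the `.split()`-like behaviour documented above.

import re

sample = "Hey, man, Good?"

# Assumed splitting rule of the existing Whitespace pre-tokenizer:
# runs of word characters and runs of punctuation become separate tokens.
def whitespace_like(text):
    return re.findall(r"\w+|[^\w\s]+", text)

# Behaviour of the new WhitespaceSplit pre-tokenizer as documented above:
# split on whitespace only, so punctuation stays attached to the word.
def whitespace_split_like(text):
    return text.split()

print(whitespace_like(sample))        # ['Hey', ',', 'man', ',', 'Good', '?']
print(whitespace_split_like(sample))  # ['Hey,', 'man,', 'Good?']
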
@@ -1,9 +1,10 @@
 var native = require('./native');
 
 module.exports = {
-  byteLevelPreTokenizer: native.pre_tokenizers_ByteLevel,
-  byteLevelAlphabet: native.pre_tokenizers_ByteLevel_Alphabet,
-  whitespacePreTokenizer: native.pre_tokenizers_Whitespace,
-  bertPreTokenizer: native.pre_tokenizers_BertPreTokenizer,
-  metaspacePreTokenizer: native.pre_tokenizers_Metaspace
+  byteLevelPreTokenizer: native.pre_tokenizers_ByteLevel,
+  byteLevelAlphabet: native.pre_tokenizers_ByteLevel_Alphabet,
+  whitespacePreTokenizer: native.pre_tokenizers_Whitespace,
+  whitespaceSplitPreTokenizer: native.pre_tokenizers_WhitespaceSplit,
+  bertPreTokenizer: native.pre_tokenizers_BertPreTokenizer,
+  metaspacePreTokenizer: native.pre_tokenizers_Metaspace
 };
@@ -3,7 +3,7 @@ import { BaseTokenizer } from "./base.tokenizer";
 import { Model, bpe } from "../bindings/models";
 import { Tokenizer } from "../bindings/tokenizer";
 import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
-import { whitespacePreTokenizer } from "../bindings/pre-tokenizers";
+import { whitespaceSplitPreTokenizer } from "../bindings/pre-tokenizers";
 import { bpeDecoder } from "../bindings/decoders";
 import { bpeTrainer } from "../bindings/trainers";
 
@@ -103,7 +103,7 @@ export class BPETokenizer extends BaseTokenizer {
 
     const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);
     tokenizer.setNormalizer(normalizer);
-    tokenizer.setPreTokenizer(whitespacePreTokenizer());
+    tokenizer.setPreTokenizer(whitespaceSplitPreTokenizer());
 
     const decoder = bpeDecoder(mergedOptions.suffix);
     tokenizer.setDecoder(decoder);
@@ -62,6 +62,17 @@ fn whitespace(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
     Ok(pretok)
 }
 
+/// whitespace_split()
+fn whitespace_split(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
+    let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
+    let guard = cx.lock();
+    pretok
+        .borrow_mut(&guard)
+        .pretok
+        .to_owned(Box::new(tk::pre_tokenizers::whitespace::WhitespaceSplit));
+    Ok(pretok)
+}
+
 /// bert_pre_tokenizer()
 fn bert_pre_tokenizer(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
     let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
@@ -105,6 +116,7 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
         byte_level_alphabet,
     )?;
     m.export_function(&format!("{}_Whitespace", prefix), whitespace)?;
+    m.export_function(&format!("{}_WhitespaceSplit", prefix), whitespace_split)?;
     m.export_function(&format!("{}_BertPreTokenizer", prefix), bert_pre_tokenizer)?;
     m.export_function(&format!("{}_Metaspace", prefix), metaspace)?;
     Ok(())
@@ -37,6 +37,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<pre_tokenizers::PreTokenizer>()?;
     m.add_class::<pre_tokenizers::ByteLevel>()?;
     m.add_class::<pre_tokenizers::Whitespace>()?;
+    m.add_class::<pre_tokenizers::WhitespaceSplit>()?;
     m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
     m.add_class::<pre_tokenizers::Metaspace>()?;
     Ok(())
@@ -73,6 +73,18 @@ impl Whitespace {
     }
 }
 
+#[pyclass]
+pub struct WhitespaceSplit {}
+#[pymethods]
+impl WhitespaceSplit {
+    #[staticmethod]
+    fn new() -> PyResult<PreTokenizer> {
+        Ok(PreTokenizer {
+            pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::WhitespaceSplit)),
+        })
+    }
+}
+
 #[pyclass]
 pub struct BertPreTokenizer {}
 #[pymethods]
@@ -30,7 +30,7 @@ class BPETokenizer(BaseTokenizer):
             NFKC.new(),
             Lowercase.new()
         ])
-        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
+        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit.new()
         tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)
 
         parameters = {
@@ -3,5 +3,6 @@ from .. import pre_tokenizers
 PreTokenizer = pre_tokenizers.PreTokenizer
 ByteLevel = pre_tokenizers.ByteLevel
 Whitespace = pre_tokenizers.Whitespace
+WhitespaceSplit = pre_tokenizers.WhitespaceSplit
 BertPreTokenizer = pre_tokenizers.BertPreTokenizer
 Metaspace = pre_tokenizers.Metaspace
@@ -55,6 +55,17 @@ class Whitespace:
         """ Instantiate a new Whitespace PreTokenizer """
         pass
 
+class WhitespaceSplit:
+    """ WhitespaceSplit PreTokenizer
+
+    This pre-tokenizer simply splits on whitespace. It works like `.split()`.
+    """
+
+    @staticmethod
+    def new() -> PreTokenizer:
+        """ Instantiate a new WhitespaceSplit PreTokenizer """
+        pass
+
 class BertPreTokenizer:
     """ BertPreTokenizer
 
@@ -26,9 +26,36 @@ impl PreTokenizer for Whitespace {
     }
 }
 
+pub struct WhitespaceSplit;
+impl PreTokenizer for WhitespaceSplit {
+    fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
+        let mut words = vec![];
+        let mut word = Vec::with_capacity(1000);
+        let mut offset = 0;
+
+        s.chars().for_each(|c| {
+            if c.is_whitespace() {
+                if !word.is_empty() {
+                    let offsets = (offset - word.len(), offset);
+                    words.push((word.drain(0..).collect::<String>(), offsets));
+                }
+            } else {
+                word.push(c);
+            }
+            offset += 1;
+        });
+        if !word.is_empty() {
+            let offsets = (offset - word.len(), offset);
+            words.push((word.drain(0..).collect::<String>(), offsets));
+        }
+
+        Ok(words)
+    }
+}
+
 #[cfg(test)]
 mod tests {
-    use super::Whitespace;
+    use super::*;
     use crate::tokenizer::PreTokenizer;
 
     #[test]
@@ -58,4 +85,26 @@ mod tests {
             assert_eq!(pretok.pre_tokenize(s).unwrap(), res);
         }
     }
+
+    #[test]
+    fn whitespace_split() {
+        let tests = vec![
+            (
+                "Hey man!",
+                vec![("Hey".into(), (0, 3)), ("man!".into(), (4, 8))],
+            ),
+            (
+                "Hey, man, Good?",
+                vec![
+                    ("Hey,".into(), (0, 4)),
+                    ("man,".into(), (5, 9)),
+                    ("Good?".into(), (10, 15)),
+                ],
+            ),
+        ];
+        let pretok = WhitespaceSplit;
+        for (s, res) in tests {
+            assert_eq!(pretok.pre_tokenize(s).unwrap(), res);
+        }
+    }
 }
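
For readers who want to poke at the offset bookkeeping without building the crate, here is a rough Python transcription of the WhitespaceSplit loop above (illustrative only; it mirrors the character-based offsets and reproduces the expected values from the test):

def whitespace_split(s):
    # Illustrative Python transcription of the Rust WhitespaceSplit above.
    words = []
    word = []
    offset = 0
    for c in s:
        if c.isspace():
            if word:
                # (start, end) offsets counted in characters, as in the Rust loop
                words.append(("".join(word), (offset - len(word), offset)))
                word.clear()
        else:
            word.append(c)
        offset += 1
    if word:
        words.append(("".join(word), (offset - len(word), offset)))
    return words

assert whitespace_split("Hey man!") == [("Hey", (0, 3)), ("man!", (4, 8))]
assert whitespace_split("Hey, man, Good?") == [
    ("Hey,", (0, 4)),
    ("man,", (5, 9)),
    ("Good?", (10, 15)),
]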