Merge pull request #92 from huggingface/fix-bpe-tokenizer

Fix BPETokenizer
This commit is contained in:
MOI Anthony
2020-01-20 09:01:02 -05:00
committed by GitHub
10 changed files with 103 additions and 9 deletions

View File

@@ -29,6 +29,13 @@ export function byteLevelAlphabet(): string[];
*/
export function whitespacePreTokenizer(): PreTokenizer;
/**
* Returns a WhitespaceSplit PreTokenizer
* This pre-tokenizer simply splits on whitespace only. Works almost like the `.split(' ')`
* function, except that it accounts for multiple consecutive spaces
*/
export function whitespaceSplitPreTokenizer(): PreTokenizer;
/**
* Returns a new Bert PreTokenizer.
* This pre-tokenizer splits tokens on spaces, and also on punctuation.
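
For reference, the splitting behaviour described in the new doc comment above (whitespace only, with runs of consecutive spaces collapsed) can be illustrated with a small standalone Rust sketch built on plain `str` methods; this is only an illustration of the expected behaviour, not the bindings API:

```rust
fn main() {
    let s = "Hello  there   world";

    // A naive `.split(' ')` keeps empty pieces between consecutive spaces.
    let naive: Vec<&str> = s.split(' ').collect();
    assert_eq!(naive, vec!["Hello", "", "there", "", "", "world"]);

    // A WhitespaceSplit-style split treats any run of whitespace as a
    // single separator, so only the words remain.
    let words: Vec<&str> = s.split_whitespace().collect();
    assert_eq!(words, vec!["Hello", "there", "world"]);
}
```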

View File

@@ -1,9 +1,10 @@
var native = require('./native');
module.exports = {
byteLevelPreTokenizer: native.pre_tokenizers_ByteLevel,
byteLevelAlphabet: native.pre_tokenizers_ByteLevel_Alphabet,
whitespacePreTokenizer: native.pre_tokenizers_Whitespace,
bertPreTokenizer: native.pre_tokenizers_BertPreTokenizer,
metaspacePreTokenizer: native.pre_tokenizers_Metaspace
byteLevelPreTokenizer: native.pre_tokenizers_ByteLevel,
byteLevelAlphabet: native.pre_tokenizers_ByteLevel_Alphabet,
whitespacePreTokenizer: native.pre_tokenizers_Whitespace,
whitespaceSplitPreTokenizer: native.pre_tokenizers_WhitespaceSplit,
bertPreTokenizer: native.pre_tokenizers_BertPreTokenizer,
metaspacePreTokenizer: native.pre_tokenizers_Metaspace
};

View File

@@ -3,7 +3,7 @@ import { BaseTokenizer } from "./base.tokenizer";
import { Model, bpe } from "../bindings/models";
import { Tokenizer } from "../bindings/tokenizer";
import { sequenceNormalizer, nfkcNormalizer, lowercaseNormalizer } from "../bindings/normalizers";
import { whitespacePreTokenizer } from "../bindings/pre-tokenizers";
import { whitespaceSplitPreTokenizer } from "../bindings/pre-tokenizers";
import { bpeDecoder } from "../bindings/decoders";
import { bpeTrainer } from "../bindings/trainers";
@@ -103,7 +103,7 @@ export class BPETokenizer extends BaseTokenizer {
const normalizer = sequenceNormalizer([nfkcNormalizer(), lowercaseNormalizer()]);
tokenizer.setNormalizer(normalizer);
tokenizer.setPreTokenizer(whitespacePreTokenizer());
tokenizer.setPreTokenizer(whitespaceSplitPreTokenizer());
const decoder = bpeDecoder(mergedOptions.suffix);
tokenizer.setDecoder(decoder);
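
To make the effect of this swap concrete, here is a hedged, standalone Rust sketch (an approximation only: the Whitespace pre-tokenizer's usual rule, roughly `\w+|[^\w\s]+`, is emulated with a simple character classifier rather than the library's implementation). Whitespace splits punctuation into its own pre-tokens, while WhitespaceSplit keeps it attached to the word the BPE model will see:

```rust
// Rough approximations of the two pre-tokenizers, for illustration only.
fn whitespace_like(s: &str) -> Vec<String> {
    // Approximates Whitespace: runs of word characters and runs of
    // punctuation become separate pre-tokens.
    let mut out = Vec::new();
    let mut cur = String::new();
    let mut cur_is_word = false;
    for c in s.chars() {
        if c.is_whitespace() {
            if !cur.is_empty() {
                out.push(std::mem::take(&mut cur));
            }
        } else {
            let is_word = c.is_alphanumeric() || c == '_';
            if !cur.is_empty() && is_word != cur_is_word {
                out.push(std::mem::take(&mut cur));
            }
            cur_is_word = is_word;
            cur.push(c);
        }
    }
    if !cur.is_empty() {
        out.push(cur);
    }
    out
}

fn whitespace_split_like(s: &str) -> Vec<String> {
    // WhitespaceSplit: split on whitespace only, punctuation stays attached.
    s.split_whitespace().map(str::to_owned).collect()
}

fn main() {
    let s = "Hey, man, Good?";
    assert_eq!(whitespace_like(s), vec!["Hey", ",", "man", ",", "Good", "?"]);
    assert_eq!(whitespace_split_like(s), vec!["Hey,", "man,", "Good?"]);
}
```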

View File

@@ -62,6 +62,17 @@ fn whitespace(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
Ok(pretok)
}
/// whitespace_split()
fn whitespace_split(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
let guard = cx.lock();
pretok
.borrow_mut(&guard)
.pretok
.to_owned(Box::new(tk::pre_tokenizers::whitespace::WhitespaceSplit));
Ok(pretok)
}
/// bert_pre_tokenizer()
fn bert_pre_tokenizer(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
@@ -105,6 +116,7 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
byte_level_alphabet,
)?;
m.export_function(&format!("{}_Whitespace", prefix), whitespace)?;
m.export_function(&format!("{}_WhitespaceSplit", prefix), whitespace_split)?;
m.export_function(&format!("{}_BertPreTokenizer", prefix), bert_pre_tokenizer)?;
m.export_function(&format!("{}_Metaspace", prefix), metaspace)?;
Ok(())

View File

@@ -37,6 +37,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<pre_tokenizers::PreTokenizer>()?;
m.add_class::<pre_tokenizers::ByteLevel>()?;
m.add_class::<pre_tokenizers::Whitespace>()?;
m.add_class::<pre_tokenizers::WhitespaceSplit>()?;
m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
m.add_class::<pre_tokenizers::Metaspace>()?;
Ok(())

View File

@@ -73,6 +73,18 @@ impl Whitespace {
}
}
#[pyclass]
pub struct WhitespaceSplit {}
#[pymethods]
impl WhitespaceSplit {
#[staticmethod]
fn new() -> PyResult<PreTokenizer> {
Ok(PreTokenizer {
pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::WhitespaceSplit)),
})
}
}
#[pyclass]
pub struct BertPreTokenizer {}
#[pymethods]

View File

@@ -30,7 +30,7 @@ class BPETokenizer(BaseTokenizer):
NFKC.new(),
Lowercase.new()
])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit.new()
tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)
parameters = {

View File

@@ -3,5 +3,6 @@ from .. import pre_tokenizers
PreTokenizer = pre_tokenizers.PreTokenizer
ByteLevel = pre_tokenizers.ByteLevel
Whitespace = pre_tokenizers.Whitespace
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace

View File

@@ -55,6 +55,17 @@ class Whitespace:
""" Instantiate a new Whitespace PreTokenizer """
pass
class WhitespaceSplit:
""" Whitespace PreTokenizer
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
"""
@staticmethod
def new() -> PreTokenizer:
""" Instantiate a new WhitespaceSplit PreTokenizer """
pass
class BertPreTokenizer:
""" BertPreTokenizer

View File

@@ -26,9 +26,36 @@ impl PreTokenizer for Whitespace {
}
}
pub struct WhitespaceSplit;
impl PreTokenizer for WhitespaceSplit {
fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
let mut words = vec![];
let mut word = Vec::with_capacity(1000);
let mut offset = 0;
s.chars().for_each(|c| {
if c.is_whitespace() {
if !word.is_empty() {
let offsets = (offset - word.len(), offset);
words.push((word.drain(0..).collect::<String>(), offsets));
}
} else {
word.push(c);
}
offset += 1;
});
if !word.is_empty() {
let offsets = (offset - word.len(), offset);
words.push((word.drain(0..).collect::<String>(), offsets));
}
Ok(words)
}
}
#[cfg(test)]
mod tests {
use super::Whitespace;
use super::*;
use crate::tokenizer::PreTokenizer;
#[test]
@@ -58,4 +85,26 @@ mod tests {
assert_eq!(pretok.pre_tokenize(s).unwrap(), res);
}
}
#[test]
fn whitespace_split() {
let tests = vec![
(
"Hey man!",
vec![("Hey".into(), (0, 3)), ("man!".into(), (4, 8))],
),
(
"Hey, man, Good?",
vec![
("Hey,".into(), (0, 4)),
("man,".into(), (5, 9)),
("Good?".into(), (10, 15)),
],
),
];
let pretok = WhitespaceSplit;
for (s, res) in tests {
assert_eq!(pretok.pre_tokenize(s).unwrap(), res);
}
}
}
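
One detail worth noting about the expected values in these tests: the offsets are character positions with an exclusive end, because `pre_tokenize` above advances `offset` once per character. A quick standalone check of the first case:

```rust
fn main() {
    let s = "Hey man!";
    let chars: Vec<char> = s.chars().collect();
    // "Hey" spans characters 0..3 and "man!" spans characters 4..8,
    // matching the (0, 3) and (4, 8) offsets asserted in the test.
    assert_eq!(chars[0..3].iter().collect::<String>(), "Hey");
    assert_eq!(chars[4..8].iter().collect::<String>(), "man!");
}
```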