Char based delimiter splitting - TransfoXL (#114)

* WIP delimiter splitter

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Bind on Python side.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Add missing delimiter parameter in CharDelimiterSplit constructor.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Attempt to provide CharDelimiterSplit for node.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Apply Rust formatting.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix node bindings

Co-authored-by: Pierric Cistac <Pierrci@users.noreply.github.com>
This commit is contained in:
Funtowicz Morgan
2020-02-04 16:23:00 +00:00
committed by GitHub
parent 3adb220973
commit 6165910ca6
9 changed files with 112 additions and 7 deletions

View File

@@ -45,7 +45,7 @@ export function whitespaceSplitPreTokenizer(): PreTokenizer;
export function bertPreTokenizer(): PreTokenizer; export function bertPreTokenizer(): PreTokenizer;
/** /**
* Returns a new Metaspace Tokenizer. * Returns a new Metaspace PreTokenizer.
* This pre-tokenizer replaces any whitespace by the provided replacement character. * This pre-tokenizer replaces any whitespace by the provided replacement character.
* It then tries to split on these spaces. * It then tries to split on these spaces.
* *
@@ -58,3 +58,12 @@ export function metaspacePreTokenizer(
replacement?: string, replacement?: string,
addPrefixSpace?: boolean addPrefixSpace?: boolean
): PreTokenizer; ): PreTokenizer;
/**
 * Returns a CharDelimiterSplit PreTokenizer.
 * This pre-tokenizer simply splits on the provided delimiter. Works almost like the
 * `.split(delimiter)` function, except that consecutive delimiters do not produce
 * empty tokens (they are skipped).
 *
 * @param delimiter The delimiter character on which the sequence will be split.
 */
export function charDelimiterSplitPreTokenizer(delimiter: string): PreTokenizer;

View File

@@ -6,5 +6,6 @@ module.exports = {
whitespacePreTokenizer: native.pre_tokenizers_Whitespace, whitespacePreTokenizer: native.pre_tokenizers_Whitespace,
whitespaceSplitPreTokenizer: native.pre_tokenizers_WhitespaceSplit, whitespaceSplitPreTokenizer: native.pre_tokenizers_WhitespaceSplit,
bertPreTokenizer: native.pre_tokenizers_BertPreTokenizer, bertPreTokenizer: native.pre_tokenizers_BertPreTokenizer,
metaspacePreTokenizer: native.pre_tokenizers_Metaspace metaspacePreTokenizer: native.pre_tokenizers_Metaspace,
charDelimiterSplitPreTokenizer: native.pre_tokenizers_CharDelimiterSplit
}; };

View File

@@ -108,6 +108,23 @@ fn metaspace(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
Ok(pretok) Ok(pretok)
} }
/// char_delimiter_split(delimiter: string)
///
/// JS-callable factory: builds a PreTokenizer that splits the input on a
/// single delimiter character. Throws a JS error when the argument is empty;
/// only the first character of the argument is used.
fn char_delimiter_split(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
    // First JS argument, coerced to a Rust String.
    let argument = cx.argument::<JsString>(0)?.value();
    // `throw_error::<_, ()>` queues the JS exception and returns Err(Throw);
    // `unwrap_err()` extracts the `Throw` so `ok_or_else` can propagate it via `?`.
    let delimiter = argument.chars().nth(0).ok_or_else(|| {
        cx.throw_error::<_, ()>("delimiter must be a character")
            .unwrap_err()
    })?;
    let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
    let guard = cx.lock();
    // NOTE(review): `to_owned` presumably installs the boxed CharDelimiterSplit
    // into the wrapper's container — confirm against the Container API.
    pretok.borrow_mut(&guard).pretok.to_owned(Box::new(
        tk::pre_tokenizers::delimiter::CharDelimiterSplit::new(delimiter),
    ));
    Ok(pretok)
}
/// Register everything here /// Register everything here
pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> { pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
m.export_function(&format!("{}_ByteLevel", prefix), byte_level)?; m.export_function(&format!("{}_ByteLevel", prefix), byte_level)?;
@@ -119,5 +136,9 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
m.export_function(&format!("{}_WhitespaceSplit", prefix), whitespace_split)?; m.export_function(&format!("{}_WhitespaceSplit", prefix), whitespace_split)?;
m.export_function(&format!("{}_BertPreTokenizer", prefix), bert_pre_tokenizer)?; m.export_function(&format!("{}_BertPreTokenizer", prefix), bert_pre_tokenizer)?;
m.export_function(&format!("{}_Metaspace", prefix), metaspace)?; m.export_function(&format!("{}_Metaspace", prefix), metaspace)?;
m.export_function(
&format!("{}_CharDelimiterSplit", prefix),
char_delimiter_split,
)?;
Ok(()) Ok(())
} }

View File

@@ -40,6 +40,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<pre_tokenizers::WhitespaceSplit>()?; m.add_class::<pre_tokenizers::WhitespaceSplit>()?;
m.add_class::<pre_tokenizers::BertPreTokenizer>()?; m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
m.add_class::<pre_tokenizers::Metaspace>()?; m.add_class::<pre_tokenizers::Metaspace>()?;
m.add_class::<pre_tokenizers::CharDelimiterSplit>()?;
Ok(()) Ok(())
} }

View File

@@ -85,6 +85,23 @@ impl WhitespaceSplit {
} }
} }
#[pyclass]
pub struct CharDelimiterSplit {}
#[pymethods]
impl CharDelimiterSplit {
    /// Create a PreTokenizer that splits on the given single-character delimiter.
    ///
    /// Raises a Python exception when `delimiter` is empty; characters beyond
    /// the first are silently ignored (only the first char is used).
    #[staticmethod]
    pub fn new(delimiter: &str) -> PyResult<PreTokenizer> {
        // `next()` over `nth(0)`, and `ok_or_else` so the exception object is
        // only constructed on the failure path.
        let chr_delimiter = delimiter.chars().next().ok_or_else(|| {
            exceptions::Exception::py_err("delimiter must be a single character")
        })?;
        Ok(PreTokenizer {
            pretok: Container::Owned(Box::new(
                tk::pre_tokenizers::delimiter::CharDelimiterSplit::new(chr_delimiter),
            )),
        })
    }
}
#[pyclass] #[pyclass]
pub struct BertPreTokenizer {} pub struct BertPreTokenizer {}
#[pymethods] #[pymethods]

View File

@@ -6,3 +6,4 @@ Whitespace = pre_tokenizers.Whitespace
WhitespaceSplit = pre_tokenizers.WhitespaceSplit WhitespaceSplit = pre_tokenizers.WhitespaceSplit
BertPreTokenizer = pre_tokenizers.BertPreTokenizer BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace Metaspace = pre_tokenizers.Metaspace
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit

View File

@@ -13,7 +13,7 @@ class PreTokenizer:
""" Pre tokenize the given sequence """ """ Pre tokenize the given sequence """
pass pass
class ByteLevel: class ByteLevel(PreTokenizer):
""" ByteLevel PreTokenizer """ ByteLevel PreTokenizer
This pre-tokenizer takes care of replacing all bytes of the given string This pre-tokenizer takes care of replacing all bytes of the given string
@@ -44,7 +44,7 @@ class ByteLevel:
""" """
pass pass
class Whitespace: class Whitespace(PreTokenizer):
""" Whitespace PreTokenizer """ Whitespace PreTokenizer
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+` This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
@@ -55,7 +55,7 @@ class Whitespace:
""" Instantiate a new Whitespace PreTokenizer """ """ Instantiate a new Whitespace PreTokenizer """
pass pass
class WhitespaceSplit: class WhitespaceSplit(PreTokenizer):
""" Whitespace PreTokenizer """ Whitespace PreTokenizer
This pre-tokenizer simply splits on the whitespace. Works like `.split()` This pre-tokenizer simply splits on the whitespace. Works like `.split()`
@@ -66,7 +66,7 @@ class WhitespaceSplit:
""" Instantiate a new WhitespaceSplit PreTokenizer """ """ Instantiate a new WhitespaceSplit PreTokenizer """
pass pass
class BertPreTokenizer: class BertPreTokenizer(PreTokenizer):
""" BertPreTokenizer """ BertPreTokenizer
This pre-tokenizer splits tokens on spaces, and also on punctuation. This pre-tokenizer splits tokens on spaces, and also on punctuation.
@@ -78,7 +78,7 @@ class BertPreTokenizer:
""" Instantiate a new BertPreTokenizer """ """ Instantiate a new BertPreTokenizer """
pass pass
class Metaspace: class Metaspace(PreTokenizer):
""" Metaspace pre-tokenizer """ Metaspace pre-tokenizer
This pre-tokenizer replaces any whitespace by the provided replacement character. This pre-tokenizer replaces any whitespace by the provided replacement character.
@@ -100,3 +100,20 @@ class Metaspace:
lets us treat `hello` exactly like `say hello`. lets us treat `hello` exactly like `say hello`.
""" """
pass pass
class CharDelimiterSplit(PreTokenizer):
    """ CharDelimiterSplit PreTokenizer

    This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`,
    except that consecutive delimiters do not produce empty tokens.
    """

    @staticmethod
    def new(delimiter: str) -> PreTokenizer:
        """ Instantiate a new CharDelimiterSplit PreTokenizer

        Args:
            delimiter: str:
                The delimiter char that will be used to split input.
                Only the first character is used; an empty string raises an exception.
        """
        pass

View File

@@ -0,0 +1,37 @@
use crate::tokenizer::{Offsets, PreTokenizer, Result};
/// Pre-tokenizer that splits the input on a single delimiter character.
/// Consecutive delimiters produce no empty words (unlike `str::split`).
pub struct CharDelimiterSplit {
    /// The character to split on.
    delimiter: char,
}
impl CharDelimiterSplit {
pub fn new(delimiter: char) -> Self {
CharDelimiterSplit { delimiter }
}
}
impl PreTokenizer for CharDelimiterSplit {
    /// Split `s` on the configured delimiter, returning each non-empty word
    /// together with its `(start, end)` offsets.
    ///
    /// Offsets are counted in *chars*, not bytes, so for multi-byte UTF-8
    /// input they are not valid byte indices into `s`.
    /// Consecutive delimiters yield no empty words (unlike `str::split`).
    fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
        let mut words = Vec::new();
        // One reusable buffer instead of a fresh 1000-element preallocation
        // per call; `mem::take` empties it without reallocating each word.
        let mut word = String::new();
        let mut word_len = 0; // chars currently buffered in `word`
        let mut offset = 0; // chars consumed so far
        for c in s.chars() {
            if c == self.delimiter {
                if !word.is_empty() {
                    words.push((std::mem::take(&mut word), (offset - word_len, offset)));
                    word_len = 0;
                }
            } else {
                word.push(c);
                word_len += 1;
            }
            offset += 1;
        }
        // Flush the trailing word when the input does not end with a delimiter.
        if !word.is_empty() {
            words.push((word, (offset - word_len, offset)));
        }
        Ok(words)
    }
}

View File

@@ -1,4 +1,5 @@
pub mod bert; pub mod bert;
pub mod byte_level; pub mod byte_level;
pub mod delimiter;
pub mod metaspace; pub mod metaspace;
pub mod whitespace; pub mod whitespace;