Char based delimiter splitting - TransfoXL (#114)

* WIP delimiter splitter

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Bind on Python side.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Add missing delimiter parameter in CharDelimiterSplit constructor.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Attempt to provide CharDelimiterSplit for node.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Apply Rust formatting.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* fix bindings node

Co-authored-by: Pierric Cistac <Pierrci@users.noreply.github.com>
This commit is contained in:
Funtowicz Morgan
2020-02-04 16:23:00 +00:00
committed by GitHub
parent 3adb220973
commit 6165910ca6
9 changed files with 112 additions and 7 deletions

View File

@@ -40,6 +40,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<pre_tokenizers::WhitespaceSplit>()?;
m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
m.add_class::<pre_tokenizers::Metaspace>()?;
m.add_class::<pre_tokenizers::CharDelimiterSplit>()?;
Ok(())
}

View File

@@ -85,6 +85,23 @@ impl WhitespaceSplit {
}
}
#[pyclass]
pub struct CharDelimiterSplit {}

#[pymethods]
impl CharDelimiterSplit {
    /// Build a `PreTokenizer` that splits the input on `delimiter`,
    /// behaving like Python's `str.split(delimiter)`.
    ///
    /// Returns a Python exception when `delimiter` is an empty string.
    #[staticmethod]
    pub fn new(delimiter: &str) -> PyResult<PreTokenizer> {
        // `next()` replaces the roundabout `nth(0)` (clippy: iter_nth_zero) and
        // `ok_or_else` keeps the error construction lazy (clippy: or_fun_call).
        // NOTE(review): characters after the first are silently ignored even
        // though the message promises a *single* character — confirm intent.
        let chr_delimiter = delimiter.chars().next().ok_or_else(|| {
            exceptions::Exception::py_err("delimiter must be a single character")
        })?;
        Ok(PreTokenizer {
            pretok: Container::Owned(Box::new(
                tk::pre_tokenizers::delimiter::CharDelimiterSplit::new(chr_delimiter),
            )),
        })
    }
}
#[pyclass]
pub struct BertPreTokenizer {}
#[pymethods]

View File

@@ -6,3 +6,4 @@ Whitespace = pre_tokenizers.Whitespace
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit

View File

@@ -13,7 +13,7 @@ class PreTokenizer:
""" Pre tokenize the given sequence """
pass
class ByteLevel:
class ByteLevel(PreTokenizer):
""" ByteLevel PreTokenizer
This pre-tokenizer takes care of replacing all bytes of the given string
@@ -44,7 +44,7 @@ class ByteLevel:
"""
pass
class Whitespace:
class Whitespace(PreTokenizer):
""" Whitespace PreTokenizer
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
@@ -55,7 +55,7 @@ class Whitespace:
""" Instantiate a new Whitespace PreTokenizer """
pass
class WhitespaceSplit:
class WhitespaceSplit(PreTokenizer):
""" Whitespace PreTokenizer
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
@@ -66,7 +66,7 @@ class WhitespaceSplit:
""" Instantiate a new WhitespaceSplit PreTokenizer """
pass
class BertPreTokenizer:
class BertPreTokenizer(PreTokenizer):
""" BertPreTokenizer
This pre-tokenizer splits tokens on spaces, and also on punctuation.
@@ -78,7 +78,7 @@ class BertPreTokenizer:
""" Instantiate a new BertPreTokenizer """
pass
class Metaspace:
class Metaspace(PreTokenizer):
""" Metaspace pre-tokenizer
This pre-tokenizer replaces any whitespace by the provided replacement character.
@@ -100,3 +100,20 @@ class Metaspace:
lets us treat `hello` exactly like `say hello`.
"""
pass
class CharDelimiterSplit(PreTokenizer):
    """ CharDelimiterSplit PreTokenizer

    Splits the input on every occurrence of the given character,
    behaving like `.split(delimiter)`.
    """

    @staticmethod
    def new(delimiter: str) -> PreTokenizer:
        """ Instantiate a new CharDelimiterSplit PreTokenizer

        Args:
            delimiter: str:
                The character on which the input will be split
        """
        pass