Addressing comments:

- Remove Deduplication in favor of WhitespaceSplit.
- Updated comments
This commit is contained in:
Nicolas Patry
2020-08-22 10:04:53 +02:00
committed by Anthony MOI
parent 1f65b4393c
commit 857948e5b8
6 changed files with 5 additions and 73 deletions

View File

@@ -108,20 +108,10 @@ class CharDelimiterSplit(PreTokenizer):
"""
pass
class Deduplication(PreTokenizer):
    """ Deduplication PreTokenizer

    Despite its name, this pre-tokenizer does not remove duplicates: the
    underlying Rust implementation simply splits the input on whitespace and
    discards the whitespace itself (it calls
    ``split(char::is_whitespace, SplitDelimiterBehavior::Removed)``).
    NOTE(review): the previous docstring claimed a ``\w+|[^\w\s]+`` regex
    split; that text was copy-pasted from another pre-tokenizer and did not
    match the implementation.
    """

    def __init__(self) -> None:
        """ Instantiate a new Deduplication PreTokenizer """
        pass
class Punctuation(PreTokenizer):
""" Punctuation PreTokenizer
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
This pre-tokenizer simply splits on punctuation as individual characters.`
"""
def __init__(self) -> None:
@@ -131,7 +121,7 @@ class Punctuation(PreTokenizer):
class Sequence(PreTokenizer):
""" Sequence PreTokenizer
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
This pre-tokenizer composes other pre_tokenizers and applies them in sequence`
"""
def __init__(self) -> None:

View File

@@ -8,11 +8,9 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};
use tk::pre_tokenizers::bert::BertPreTokenizer;
use tk::pre_tokenizers::byte_level::ByteLevel;
use tk::pre_tokenizers::deduplication::Deduplication;
use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
use tk::pre_tokenizers::metaspace::Metaspace;
use tk::pre_tokenizers::punctuation::Punctuation;
// use tk::pre_tokenizers::sequence::Sequence;
use tk::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
use tk::pre_tokenizers::PreTokenizerWrapper;
use tk::tokenizer::Offsets;
@@ -47,9 +45,6 @@ impl PyPreTokenizer {
PreTokenizerWrapper::Whitespace(_) => {
Py::new(py, (PyWhitespace {}, base)).map(Into::into)
}
PreTokenizerWrapper::Deduplication(_) => {
Py::new(py, (PyDeduplication {}, base)).map(Into::into)
}
PreTokenizerWrapper::Punctuation(_) => {
Py::new(py, (PyPunctuation {}, base)).map(Into::into)
}
@@ -217,16 +212,6 @@ impl PyBertPreTokenizer {
}
}
/// Python binding for the `Deduplication` pre-tokenizer.
///
/// Exposed to Python as `tokenizers.pre_tokenizers.Deduplication`; it extends
/// the `PyPreTokenizer` base class so it can be used anywhere a pre-tokenizer
/// is expected on the Python side.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Deduplication)]
pub struct PyDeduplication {}
#[pymethods]
impl PyDeduplication {
    /// `Deduplication()` constructor: wraps the (unit-struct) Rust
    /// `Deduplication` in the shared `PyPreTokenizer` base via `.into()`.
    #[new]
    fn new() -> PyResult<(Self, PyPreTokenizer)> {
        Ok((PyDeduplication {}, Deduplication.into()))
    }
}
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)]
pub struct PyPunctuation {}
#[pymethods]

View File

@@ -1,38 +0,0 @@
use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
/// Pre-tokenizer that splits the input on whitespace, dropping the
/// whitespace characters themselves (`SplitDelimiterBehavior::Removed`).
///
/// NOTE(review): the name is misleading — nothing is deduplicated here. The
/// behavior appears to duplicate the `WhitespaceSplit` pre-tokenizer (the
/// commit message says it is removed "in favor of WhitespaceSplit"), which is
/// presumably why this type is being deleted.
#[derive(Copy, Clone, Debug)]
pub struct Deduplication;
// Generates the (de)serialization impls for this unit struct.
impl_serde_unit_struct!(DeduplicationVisitor, Deduplication);
impl PreTokenizer for Deduplication {
    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
        // Split every current piece on any whitespace char; the whitespace is
        // removed from the resulting splits rather than kept as tokens.
        pretokenized.split(|_, s| s.split(char::is_whitespace, SplitDelimiterBehavior::Removed))
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::OffsetReferential;

    /// Checks that `Deduplication` splits on whitespace, drops the whitespace,
    /// and reports offsets relative to the *original* string.
    #[test]
    fn deduplication_basic() {
        let pretok = Deduplication;
        // The run of five spaces between "friend!" and "How" is intentional:
        // the expected offsets below ((4, 11) for "friend!", (16, 19) for
        // "How") only hold with those spaces present. It also verifies that
        // consecutive whitespace produces no empty splits.
        // (The single-space form of this literal would put "How" at (12, 15),
        // contradicting the asserts — the spaces had been collapsed.)
        let mut pretokenized: PreTokenizedString = "Hey friend!     How are you?!?".into();
        pretok.pre_tokenize(&mut pretokenized).unwrap();
        assert_eq!(
            pretokenized
                .get_splits(OffsetReferential::Original)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>(),
            vec![
                ("Hey", (0, 3)),
                ("friend!", (4, 11)),
                ("How", (16, 19)),
                ("are", (20, 23)),
                ("you?!?", (24, 30)),
            ]
        );
    }
}

View File

@@ -1,6 +1,5 @@
pub mod bert;
pub mod byte_level;
pub mod deduplication;
pub mod delimiter;
pub mod metaspace;
pub mod punctuation;
@@ -11,7 +10,6 @@ use serde::{Deserialize, Serialize};
use crate::pre_tokenizers::bert::BertPreTokenizer;
use crate::pre_tokenizers::byte_level::ByteLevel;
use crate::pre_tokenizers::deduplication::Deduplication;
use crate::pre_tokenizers::delimiter::CharDelimiterSplit;
use crate::pre_tokenizers::metaspace::Metaspace;
use crate::pre_tokenizers::punctuation::Punctuation;
@@ -28,7 +26,6 @@ pub enum PreTokenizerWrapper {
Metaspace(Metaspace),
Whitespace(Whitespace),
Sequence(Sequence),
Deduplication(Deduplication),
Punctuation(Punctuation),
WhitespaceSplit(WhitespaceSplit),
}
@@ -41,7 +38,6 @@ impl PreTokenizer for PreTokenizerWrapper {
PreTokenizerWrapper::Delimiter(dpt) => dpt.pre_tokenize(normalized),
PreTokenizerWrapper::Metaspace(mspt) => mspt.pre_tokenize(normalized),
PreTokenizerWrapper::Whitespace(wspt) => wspt.pre_tokenize(normalized),
PreTokenizerWrapper::Deduplication(tok) => tok.pre_tokenize(normalized),
PreTokenizerWrapper::Punctuation(tok) => tok.pre_tokenize(normalized),
PreTokenizerWrapper::Sequence(tok) => tok.pre_tokenize(normalized),
PreTokenizerWrapper::WhitespaceSplit(wspt) => wspt.pre_tokenize(normalized),
@@ -53,7 +49,6 @@ impl_enum_from!(BertPreTokenizer, PreTokenizerWrapper, BertPreTokenizer);
// Generate `From<ConcreteType> for PreTokenizerWrapper` impls, mapping each
// concrete pre-tokenizer into the named wrapper-enum variant so any of them
// can be stored where a `PreTokenizerWrapper` is expected.
impl_enum_from!(ByteLevel, PreTokenizerWrapper, ByteLevel);
impl_enum_from!(CharDelimiterSplit, PreTokenizerWrapper, Delimiter);
impl_enum_from!(Whitespace, PreTokenizerWrapper, Whitespace);
impl_enum_from!(Deduplication, PreTokenizerWrapper, Deduplication);
impl_enum_from!(Punctuation, PreTokenizerWrapper, Punctuation);
impl_enum_from!(Sequence, PreTokenizerWrapper, Sequence);
impl_enum_from!(Metaspace, PreTokenizerWrapper, Metaspace);

View File

@@ -11,7 +11,7 @@ impl_serde_unit_struct!(PunctuationVisitor, Punctuation);
impl PreTokenizer for Punctuation {
    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
        // NOTE(review): the two statements below are a rendered diff pair (the
        // removed line followed by its replacement, with the +/- markers
        // stripped): the commit renames the predicate `is_bert_punc` to
        // `is_punc`. In the actual file only one of these lines exists; as
        // plain source the pair would not compile. Each splits on punctuation,
        // keeping every punctuation char as its own isolated token.
        pretokenized.split(|_, s| s.split(is_bert_punc, SplitDelimiterBehavior::Isolated))
        pretokenized.split(|_, s| s.split(is_punc, SplitDelimiterBehavior::Isolated))
    }
}

View File

@@ -26,13 +26,13 @@ impl PreTokenizer for Sequence {
#[cfg(test)]
mod tests {
use super::*;
use crate::pre_tokenizers::{deduplication::Deduplication, punctuation::Punctuation};
use crate::pre_tokenizers::{punctuation::Punctuation, whitespace::WhitespaceSplit};
use crate::OffsetReferential;
#[test]
fn sequence_basic() {
let pretokenizers = vec![
PreTokenizerWrapper::Deduplication(Deduplication),
PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit),
PreTokenizerWrapper::Punctuation(Punctuation),
];
let pretok = Sequence::new(pretokenizers);