mirror of https://github.com/mii443/tokenizers.git
synced 2025-12-07 05:08:24 +00:00
Addressing comments:
- Remove Deduplication in favor of WhitespaceSplit.
- Updated comments
committed by Anthony MOI
parent 1f65b4393c
commit 857948e5b8
@@ -108,20 +108,10 @@ class CharDelimiterSplit(PreTokenizer):
     """
     pass

-class Deduplication(PreTokenizer):
-    """ Deduplication PreTokenizer
-
-    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
-    """
-
-    def __init__(self) -> None:
-        """ Instantiate a new Deduplication PreTokenizer """
-        pass
-
 class Punctuation(PreTokenizer):
     """ Punctuation PreTokenizer

-    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
+    This pre-tokenizer simply splits on punctuation as individual characters.
     """

     def __init__(self) -> None:
@@ -131,7 +121,7 @@ class Punctuation(PreTokenizer):
 class Sequence(PreTokenizer):
     """ Sequence PreTokenizer

-    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
+    This pre-tokenizer composes other pre_tokenizers and applies them in sequence
     """

     def __init__(self) -> None:
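The reworded docstrings describe behavior implemented by the Rust core shown further down in this diff. As a minimal sketch of what "splits on punctuation as individual characters" means in practice (crate paths mirror this diff's imports; the sample string and expected splits are illustrative, not taken from the commit):

    use tokenizers::pre_tokenizers::punctuation::Punctuation;
    use tokenizers::tokenizer::{PreTokenizedString, PreTokenizer, Result};

    fn main() -> Result<()> {
        let pretok = Punctuation;
        let mut pretokenized: PreTokenizedString = "Hey friend!?".into();
        pretok.pre_tokenize(&mut pretokenized)?;
        // Punctuation uses SplitDelimiterBehavior::Isolated, so each punctuation
        // character becomes its own split while the rest stays intact:
        // ("Hey friend", (0, 10)), ("!", (10, 11)), ("?", (11, 12))
        Ok(())
    }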
@@ -8,11 +8,9 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};

 use tk::pre_tokenizers::bert::BertPreTokenizer;
 use tk::pre_tokenizers::byte_level::ByteLevel;
-use tk::pre_tokenizers::deduplication::Deduplication;
 use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
 use tk::pre_tokenizers::metaspace::Metaspace;
 use tk::pre_tokenizers::punctuation::Punctuation;
-// use tk::pre_tokenizers::sequence::Sequence;
 use tk::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
 use tk::pre_tokenizers::PreTokenizerWrapper;
 use tk::tokenizer::Offsets;
@@ -47,9 +45,6 @@ impl PyPreTokenizer {
             PreTokenizerWrapper::Whitespace(_) => {
                 Py::new(py, (PyWhitespace {}, base)).map(Into::into)
             }
-            PreTokenizerWrapper::Deduplication(_) => {
-                Py::new(py, (PyDeduplication {}, base)).map(Into::into)
-            }
             PreTokenizerWrapper::Punctuation(_) => {
                 Py::new(py, (PyPunctuation {}, base)).map(Into::into)
             }
@@ -217,16 +212,6 @@ impl PyBertPreTokenizer {
     }
 }

-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Deduplication)]
-pub struct PyDeduplication {}
-#[pymethods]
-impl PyDeduplication {
-    #[new]
-    fn new() -> PyResult<(Self, PyPreTokenizer)> {
-        Ok((PyDeduplication {}, Deduplication.into()))
-    }
-}
-
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)]
 pub struct PyPunctuation {}
 #[pymethods]
@@ -1,38 +0,0 @@
-use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
-
-#[derive(Copy, Clone, Debug)]
-pub struct Deduplication;
-impl_serde_unit_struct!(DeduplicationVisitor, Deduplication);
-
-impl PreTokenizer for Deduplication {
-    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        pretokenized.split(|_, s| s.split(char::is_whitespace, SplitDelimiterBehavior::Removed))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::OffsetReferential;
-
-    #[test]
-    fn deduplication_basic() {
-        let pretok = Deduplication;
-        let mut pretokenized: PreTokenizedString = "Hey friend!     How are you?!?".into();
-        pretok.pre_tokenize(&mut pretokenized).unwrap();
-        assert_eq!(
-            pretokenized
-                .get_splits(OffsetReferential::Original)
-                .into_iter()
-                .map(|(s, o, _)| (s, o))
-                .collect::<Vec<_>>(),
-            vec![
-                ("Hey", (0, 3)),
-                ("friend!", (4, 11)),
-                ("How", (16, 19)),
-                ("are", (20, 23)),
-                ("you?!?", (24, 30)),
-            ]
-        );
-    }
-}
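The deleted pre_tokenize body above is the same splitting logic that WhitespaceSplit already provides, which is the stated reason for dropping Deduplication rather than keeping two unit structs with identical behavior. For comparison, a sketch of the surviving implementation (assumed to live in whitespace.rs; shown here for context, not part of this diff):

    impl PreTokenizer for WhitespaceSplit {
        fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
            // Same behavior as the deleted Deduplication: split on any
            // whitespace and discard the whitespace from the splits.
            pretokenized.split(|_, s| s.split(char::is_whitespace, SplitDelimiterBehavior::Removed))
        }
    }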
@@ -1,6 +1,5 @@
 pub mod bert;
 pub mod byte_level;
-pub mod deduplication;
 pub mod delimiter;
 pub mod metaspace;
 pub mod punctuation;
@@ -11,7 +10,6 @@ use serde::{Deserialize, Serialize};

 use crate::pre_tokenizers::bert::BertPreTokenizer;
 use crate::pre_tokenizers::byte_level::ByteLevel;
-use crate::pre_tokenizers::deduplication::Deduplication;
 use crate::pre_tokenizers::delimiter::CharDelimiterSplit;
 use crate::pre_tokenizers::metaspace::Metaspace;
 use crate::pre_tokenizers::punctuation::Punctuation;
@@ -28,7 +26,6 @@ pub enum PreTokenizerWrapper {
     Metaspace(Metaspace),
     Whitespace(Whitespace),
     Sequence(Sequence),
-    Deduplication(Deduplication),
     Punctuation(Punctuation),
     WhitespaceSplit(WhitespaceSplit),
 }
@@ -41,7 +38,6 @@ impl PreTokenizer for PreTokenizerWrapper {
             PreTokenizerWrapper::Delimiter(dpt) => dpt.pre_tokenize(normalized),
             PreTokenizerWrapper::Metaspace(mspt) => mspt.pre_tokenize(normalized),
             PreTokenizerWrapper::Whitespace(wspt) => wspt.pre_tokenize(normalized),
-            PreTokenizerWrapper::Deduplication(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::Punctuation(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::Sequence(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::WhitespaceSplit(wspt) => wspt.pre_tokenize(normalized),
@@ -53,7 +49,6 @@ impl_enum_from!(BertPreTokenizer, PreTokenizerWrapper, BertPreTokenizer);
 impl_enum_from!(ByteLevel, PreTokenizerWrapper, ByteLevel);
 impl_enum_from!(CharDelimiterSplit, PreTokenizerWrapper, Delimiter);
 impl_enum_from!(Whitespace, PreTokenizerWrapper, Whitespace);
-impl_enum_from!(Deduplication, PreTokenizerWrapper, Deduplication);
 impl_enum_from!(Punctuation, PreTokenizerWrapper, Punctuation);
 impl_enum_from!(Sequence, PreTokenizerWrapper, Sequence);
 impl_enum_from!(Metaspace, PreTokenizerWrapper, Metaspace);
@@ -11,7 +11,7 @@ impl_serde_unit_struct!(PunctuationVisitor, Punctuation);

 impl PreTokenizer for Punctuation {
     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        pretokenized.split(|_, s| s.split(is_bert_punc, SplitDelimiterBehavior::Isolated))
+        pretokenized.split(|_, s| s.split(is_punc, SplitDelimiterBehavior::Isolated))
     }
 }

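The hunk renames the predicate from the BERT-scoped is_bert_punc to a standalone is_punc, but its definition is outside this hunk. A plausible sketch, assuming it combines ASCII punctuation with the Unicode punctuation categories (the unicode_categories crate and this exact body are assumptions, not shown in the commit):

    use unicode_categories::UnicodeCategories;

    // Hypothetical helper: true for ASCII punctuation or any char in a
    // Unicode punctuation category (Pc, Pd, Pe, Pf, Pi, Po, Ps).
    fn is_punc(x: char) -> bool {
        x.is_ascii_punctuation() || x.is_punctuation()
    }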
@@ -26,13 +26,13 @@ impl PreTokenizer for Sequence {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::pre_tokenizers::{deduplication::Deduplication, punctuation::Punctuation};
+    use crate::pre_tokenizers::{punctuation::Punctuation, whitespace::WhitespaceSplit};
     use crate::OffsetReferential;

     #[test]
     fn sequence_basic() {
         let pretokenizers = vec![
-            PreTokenizerWrapper::Deduplication(Deduplication),
+            PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit),
             PreTokenizerWrapper::Punctuation(Punctuation),
         ];
         let pretok = Sequence::new(pretokenizers);
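To make the updated test concrete, a minimal sketch of running the same composed pipeline outside the test module (crate paths mirror this diff's imports; the sample string and expected splits are illustrative):

    use tokenizers::pre_tokenizers::punctuation::Punctuation;
    use tokenizers::pre_tokenizers::sequence::Sequence;
    use tokenizers::pre_tokenizers::whitespace::WhitespaceSplit;
    use tokenizers::pre_tokenizers::PreTokenizerWrapper;
    use tokenizers::tokenizer::{PreTokenizedString, PreTokenizer, Result};

    fn main() -> Result<()> {
        // WhitespaceSplit runs first, then Punctuation isolates each mark,
        // applied in order over the splits produced so far.
        let pretok = Sequence::new(vec![
            PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit),
            PreTokenizerWrapper::Punctuation(Punctuation),
        ]);
        let mut pretokenized: PreTokenizedString = "Hey friend!".into();
        pretok.pre_tokenize(&mut pretokenized)?;
        // Expected splits: ("Hey", (0, 3)), ("friend", (4, 10)), ("!", (10, 11))
        Ok(())
    }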