Addressing comments:
- Remove Deduplication in favor of WhitespaceSplit.
- Updated comments
committed by Anthony MOI
parent 1f65b4393c
commit 857948e5b8
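Note on the change: the Deduplication pre-tokenizer removed below split its input on whitespace and dropped the delimiters, which is exactly what the existing WhitespaceSplit pre-tokenizer already does. A minimal sketch of the equivalent behavior through the Python bindings (illustration only, not part of this commit; assumes a tokenizers release that exposes pre_tokenize_str):

    from tokenizers.pre_tokenizers import WhitespaceSplit

    # Same input as the deleted deduplication_basic test below;
    # WhitespaceSplit reproduces the splits that test asserted.
    print(WhitespaceSplit().pre_tokenize_str("Hey friend!     How are you?!?"))
    # [('Hey', (0, 3)), ('friend!', (4, 11)), ('How', (16, 19)),
    #  ('are', (20, 23)), ('you?!?', (24, 30))]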
@@ -108,20 +108,10 @@ class CharDelimiterSplit(PreTokenizer):
         """
         pass
 
-class Deduplication(PreTokenizer):
-    """ Deduplication PreTokenizer
-
-    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
-    """
-
-    def __init__(self) -> None:
-        """ Instantiate a new Deduplication PreTokenizer """
-        pass
-
 class Punctuation(PreTokenizer):
     """ Punctuation PreTokenizer
 
-    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
+    This pre-tokenizer simply splits on punctuation as individual characters.
     """
 
     def __init__(self) -> None:
@@ -131,7 +121,7 @@ class Punctuation(PreTokenizer):
 class Sequence(PreTokenizer):
     """ Sequence PreTokenizer
 
-    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
+    This pre-tokenizer composes other pre_tokenizers and applies them in sequence
     """
 
     def __init__(self) -> None:
@@ -8,11 +8,9 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};
 
 use tk::pre_tokenizers::bert::BertPreTokenizer;
 use tk::pre_tokenizers::byte_level::ByteLevel;
-use tk::pre_tokenizers::deduplication::Deduplication;
 use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
 use tk::pre_tokenizers::metaspace::Metaspace;
 use tk::pre_tokenizers::punctuation::Punctuation;
-// use tk::pre_tokenizers::sequence::Sequence;
 use tk::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
 use tk::pre_tokenizers::PreTokenizerWrapper;
 use tk::tokenizer::Offsets;
@@ -47,9 +45,6 @@ impl PyPreTokenizer {
             PreTokenizerWrapper::Whitespace(_) => {
                 Py::new(py, (PyWhitespace {}, base)).map(Into::into)
             }
-            PreTokenizerWrapper::Deduplication(_) => {
-                Py::new(py, (PyDeduplication {}, base)).map(Into::into)
-            }
             PreTokenizerWrapper::Punctuation(_) => {
                 Py::new(py, (PyPunctuation {}, base)).map(Into::into)
             }
@@ -217,16 +212,6 @@ impl PyBertPreTokenizer {
     }
 }
 
-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Deduplication)]
-pub struct PyDeduplication {}
-#[pymethods]
-impl PyDeduplication {
-    #[new]
-    fn new() -> PyResult<(Self, PyPreTokenizer)> {
-        Ok((PyDeduplication {}, Deduplication.into()))
-    }
-}
-
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)]
 pub struct PyPunctuation {}
 #[pymethods]
@@ -1,38 +0,0 @@
-use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
-
-#[derive(Copy, Clone, Debug)]
-pub struct Deduplication;
-impl_serde_unit_struct!(DeduplicationVisitor, Deduplication);
-
-impl PreTokenizer for Deduplication {
-    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        pretokenized.split(|_, s| s.split(char::is_whitespace, SplitDelimiterBehavior::Removed))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::OffsetReferential;
-
-    #[test]
-    fn deduplication_basic() {
-        let pretok = Deduplication;
-        let mut pretokenized: PreTokenizedString = "Hey friend!     How are you?!?".into();
-        pretok.pre_tokenize(&mut pretokenized).unwrap();
-        assert_eq!(
-            pretokenized
-                .get_splits(OffsetReferential::Original)
-                .into_iter()
-                .map(|(s, o, _)| (s, o))
-                .collect::<Vec<_>>(),
-            vec![
-                ("Hey", (0, 3)),
-                ("friend!", (4, 11)),
-                ("How", (16, 19)),
-                ("are", (20, 23)),
-                ("you?!?", (24, 30)),
-            ]
-        );
-    }
-}
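The deleted implementation above used SplitDelimiterBehavior::Removed, i.e. the whitespace matches are dropped from the output, while Punctuation (later in this diff) uses Isolated, which keeps each match as its own split. A sketch of the difference via the generic Split pre-tokenizer in the Python bindings (Split was added in a later release than this commit; the behavior strings below reflect that later API):

    from tokenizers.pre_tokenizers import Split

    # "removed": delimiter matches are dropped (Deduplication / WhitespaceSplit)
    print(Split(" ", "removed").pre_tokenize_str("Hey friend!"))
    # [('Hey', (0, 3)), ('friend!', (4, 11))]

    # "isolated": each match becomes its own split (what Punctuation does)
    print(Split(" ", "isolated").pre_tokenize_str("Hey friend!"))
    # [('Hey', (0, 3)), (' ', (3, 4)), ('friend!', (4, 11))]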
@@ -1,6 +1,5 @@
 pub mod bert;
 pub mod byte_level;
-pub mod deduplication;
 pub mod delimiter;
 pub mod metaspace;
 pub mod punctuation;
@@ -11,7 +10,6 @@ use serde::{Deserialize, Serialize};
 
 use crate::pre_tokenizers::bert::BertPreTokenizer;
 use crate::pre_tokenizers::byte_level::ByteLevel;
-use crate::pre_tokenizers::deduplication::Deduplication;
 use crate::pre_tokenizers::delimiter::CharDelimiterSplit;
 use crate::pre_tokenizers::metaspace::Metaspace;
 use crate::pre_tokenizers::punctuation::Punctuation;
@@ -28,7 +26,6 @@ pub enum PreTokenizerWrapper {
     Metaspace(Metaspace),
     Whitespace(Whitespace),
     Sequence(Sequence),
-    Deduplication(Deduplication),
     Punctuation(Punctuation),
     WhitespaceSplit(WhitespaceSplit),
 }
@@ -41,7 +38,6 @@ impl PreTokenizer for PreTokenizerWrapper {
             PreTokenizerWrapper::Delimiter(dpt) => dpt.pre_tokenize(normalized),
             PreTokenizerWrapper::Metaspace(mspt) => mspt.pre_tokenize(normalized),
             PreTokenizerWrapper::Whitespace(wspt) => wspt.pre_tokenize(normalized),
-            PreTokenizerWrapper::Deduplication(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::Punctuation(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::Sequence(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::WhitespaceSplit(wspt) => wspt.pre_tokenize(normalized),
@@ -53,7 +49,6 @@ impl_enum_from!(BertPreTokenizer, PreTokenizerWrapper, BertPreTokenizer);
 impl_enum_from!(ByteLevel, PreTokenizerWrapper, ByteLevel);
 impl_enum_from!(CharDelimiterSplit, PreTokenizerWrapper, Delimiter);
 impl_enum_from!(Whitespace, PreTokenizerWrapper, Whitespace);
-impl_enum_from!(Deduplication, PreTokenizerWrapper, Deduplication);
 impl_enum_from!(Punctuation, PreTokenizerWrapper, Punctuation);
 impl_enum_from!(Sequence, PreTokenizerWrapper, Sequence);
 impl_enum_from!(Metaspace, PreTokenizerWrapper, Metaspace);
@@ -11,7 +11,7 @@ impl_serde_unit_struct!(PunctuationVisitor, Punctuation);
 
 impl PreTokenizer for Punctuation {
     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        pretokenized.split(|_, s| s.split(is_bert_punc, SplitDelimiterBehavior::Isolated))
+        pretokenized.split(|_, s| s.split(is_punc, SplitDelimiterBehavior::Isolated))
     }
 }
 
@@ -26,13 +26,13 @@ impl PreTokenizer for Sequence {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::pre_tokenizers::{deduplication::Deduplication, punctuation::Punctuation};
+    use crate::pre_tokenizers::{punctuation::Punctuation, whitespace::WhitespaceSplit};
     use crate::OffsetReferential;
 
     #[test]
     fn sequence_basic() {
         let pretokenizers = vec![
-            PreTokenizerWrapper::Deduplication(Deduplication),
+            PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit),
             PreTokenizerWrapper::Punctuation(Punctuation),
         ];
         let pretok = Sequence::new(pretokenizers);
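The updated sequence_basic test composes WhitespaceSplit with Punctuation. For reference, the same composition through the Python bindings (illustration only; Sequence is still commented out in the Python bindings in this commit, so this assumes a release where it is exposed):

    from tokenizers.pre_tokenizers import Punctuation, Sequence, WhitespaceSplit

    # Split on whitespace first, then isolate each punctuation character.
    pre_tok = Sequence([WhitespaceSplit(), Punctuation()])
    print(pre_tok.pre_tokenize_str("Hey friend!"))
    # [('Hey', (0, 3)), ('friend', (4, 10)), ('!', (10, 11))]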