mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-31 12:39:21 +00:00
Python - Add NormalizedString + doc/typings
This commit is contained in:
@ -1,6 +1,7 @@
|
|||||||
extern crate tokenizers as tk;
|
extern crate tokenizers as tk;
|
||||||
|
|
||||||
use crate::error::PyError;
|
use crate::error::PyError;
|
||||||
|
use crate::normalized_string::NormalizedString;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
use pyo3::types::*;
|
use pyo3::types::*;
|
||||||
use tk::tokenizer::PaddingDirection;
|
use tk::tokenizer::PaddingDirection;
|
||||||
@ -20,13 +21,8 @@ impl Encoding {
|
|||||||
#[pymethods]
|
#[pymethods]
|
||||||
impl Encoding {
|
impl Encoding {
|
||||||
#[getter]
|
#[getter]
|
||||||
fn get_original(&self) -> String {
|
fn get_normalized(&self) -> NormalizedString {
|
||||||
self.encoding.get_normalized().get_original().to_owned()
|
NormalizedString::new(self.encoding.get_normalized().clone())
|
||||||
}
|
|
||||||
|
|
||||||
#[getter]
|
|
||||||
fn get_normalized(&self) -> String {
|
|
||||||
self.encoding.get_normalized().get().to_owned()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[args(kwargs = "**")]
|
#[args(kwargs = "**")]
|
||||||
|
@ -2,6 +2,7 @@ mod decoders;
|
|||||||
mod encoding;
|
mod encoding;
|
||||||
mod error;
|
mod error;
|
||||||
mod models;
|
mod models;
|
||||||
|
mod normalized_string;
|
||||||
mod normalizers;
|
mod normalizers;
|
||||||
mod pre_tokenizers;
|
mod pre_tokenizers;
|
||||||
mod processors;
|
mod processors;
|
||||||
@ -70,6 +71,8 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
|||||||
#[pymodule]
|
#[pymodule]
|
||||||
fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||||
m.add_class::<tokenizer::Tokenizer>()?;
|
m.add_class::<tokenizer::Tokenizer>()?;
|
||||||
|
m.add_class::<encoding::Encoding>()?;
|
||||||
|
m.add_class::<normalized_string::NormalizedString>()?;
|
||||||
m.add_wrapped(wrap_pymodule!(models))?;
|
m.add_wrapped(wrap_pymodule!(models))?;
|
||||||
m.add_wrapped(wrap_pymodule!(pre_tokenizers))?;
|
m.add_wrapped(wrap_pymodule!(pre_tokenizers))?;
|
||||||
m.add_wrapped(wrap_pymodule!(decoders))?;
|
m.add_wrapped(wrap_pymodule!(decoders))?;
|
||||||
|
35
bindings/python/src/normalized_string.rs
Normal file
35
bindings/python/src/normalized_string.rs
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
extern crate tokenizers as tk;
|
||||||
|
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
|
||||||
|
#[pyclass]
|
||||||
|
#[repr(transparent)]
|
||||||
|
pub struct NormalizedString {
|
||||||
|
s: tk::tokenizer::NormalizedString,
|
||||||
|
}
|
||||||
|
impl NormalizedString {
|
||||||
|
pub fn new(s: tk::tokenizer::NormalizedString) -> NormalizedString {
|
||||||
|
NormalizedString { s }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl NormalizedString {
|
||||||
|
#[getter]
|
||||||
|
fn get_original(&self) -> String {
|
||||||
|
self.s.get_original().to_owned()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[getter]
|
||||||
|
fn get_normalized(&self) -> String {
|
||||||
|
self.s.get().to_owned()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_range(&self, start: usize, end: usize) -> Option<String> {
|
||||||
|
self.s.get_range(start..end).map(|s| s.to_owned())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_range_original(&self, start: usize, end: usize) -> Option<String> {
|
||||||
|
self.s.get_range_original(start..end).map(|s| s.to_owned())
|
||||||
|
}
|
||||||
|
}
|
@ -9,10 +9,62 @@ from typing import Optional, Union, List, Tuple
|
|||||||
|
|
||||||
Offsets = Tuple[int, int]
|
Offsets = Tuple[int, int]
|
||||||
|
|
||||||
|
class NormalizedString:
    """ The string produced by normalization, kept alongside its original form """

    @property
    def original(self) -> str:
        """ The string as it was before normalization """
        ...

    @property
    def normalized(self) -> str:
        """ The string as it is after normalization """
        ...

    def get_range(self, start: int, end: int) -> Optional[str]:
        """ Extract part of the normalized string, provided the bounds are correct

        Args:
            start: int:
                Offset in the string at which the range starts

            end: int:
                Offset in the string at which the range ends

        Returns:
            The requested substring if the bounds are correct
        """
        ...

    def get_range_original(self, start: int, end: int) -> Optional[str]:
        """ Extract part of the original string, provided the bounds are correct

        The bounds are expected to be offsets expressed after normalization.
        Given the `Encoding.offsets` associated with one `Encoding.ids` unit,
        this method returns the part of the original string that corresponds
        to that id.

        Args:
            start: int:
                Offset in the normalized string at which the range starts

            end: int:
                Offset in the normalized string at which the range ends

        Returns:
            The requested substring if the bounds are correct
        """
        ...
|
||||||
|
|
||||||
|
|
||||||
class Encoding:
|
class Encoding:
|
||||||
""" An Encoding as returned by the Tokenizer """
|
""" An Encoding as returned by the Tokenizer """
|
||||||
|
|
||||||
|
@property
|
||||||
|
def normalized(self) -> NormalizedString:
|
||||||
|
""" The NormalizedString """
|
||||||
|
pass
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ids(self) -> List[int]:
|
def ids(self) -> List[int]:
|
||||||
""" The tokenized ids """
|
""" The tokenized ids """
|
||||||
|
Reference in New Issue
Block a user