mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-31 04:29:21 +00:00
Python - Add NormalizedString + doc/typings
This commit is contained in:
@ -1,6 +1,7 @@
|
||||
extern crate tokenizers as tk;
|
||||
|
||||
use crate::error::PyError;
|
||||
use crate::normalized_string::NormalizedString;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::*;
|
||||
use tk::tokenizer::PaddingDirection;
|
||||
@ -20,13 +21,8 @@ impl Encoding {
|
||||
#[pymethods]
|
||||
impl Encoding {
|
||||
#[getter]
|
||||
fn get_original(&self) -> String {
|
||||
self.encoding.get_normalized().get_original().to_owned()
|
||||
}
|
||||
|
||||
#[getter]
|
||||
fn get_normalized(&self) -> String {
|
||||
self.encoding.get_normalized().get().to_owned()
|
||||
fn get_normalized(&self) -> NormalizedString {
|
||||
NormalizedString::new(self.encoding.get_normalized().clone())
|
||||
}
|
||||
|
||||
#[args(kwargs = "**")]
|
||||
|
@ -2,6 +2,7 @@ mod decoders;
|
||||
mod encoding;
|
||||
mod error;
|
||||
mod models;
|
||||
mod normalized_string;
|
||||
mod normalizers;
|
||||
mod pre_tokenizers;
|
||||
mod processors;
|
||||
@ -70,6 +71,8 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
#[pymodule]
|
||||
fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<tokenizer::Tokenizer>()?;
|
||||
m.add_class::<encoding::Encoding>()?;
|
||||
m.add_class::<normalized_string::NormalizedString>()?;
|
||||
m.add_wrapped(wrap_pymodule!(models))?;
|
||||
m.add_wrapped(wrap_pymodule!(pre_tokenizers))?;
|
||||
m.add_wrapped(wrap_pymodule!(decoders))?;
|
||||
|
35
bindings/python/src/normalized_string.rs
Normal file
35
bindings/python/src/normalized_string.rs
Normal file
@ -0,0 +1,35 @@
|
||||
extern crate tokenizers as tk;
|
||||
|
||||
use pyo3::prelude::*;
|
||||
|
||||
#[pyclass]
|
||||
#[repr(transparent)]
|
||||
pub struct NormalizedString {
|
||||
s: tk::tokenizer::NormalizedString,
|
||||
}
|
||||
impl NormalizedString {
|
||||
pub fn new(s: tk::tokenizer::NormalizedString) -> NormalizedString {
|
||||
NormalizedString { s }
|
||||
}
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl NormalizedString {
|
||||
#[getter]
|
||||
fn get_original(&self) -> String {
|
||||
self.s.get_original().to_owned()
|
||||
}
|
||||
|
||||
#[getter]
|
||||
fn get_normalized(&self) -> String {
|
||||
self.s.get().to_owned()
|
||||
}
|
||||
|
||||
fn get_range(&self, start: usize, end: usize) -> Option<String> {
|
||||
self.s.get_range(start..end).map(|s| s.to_owned())
|
||||
}
|
||||
|
||||
fn get_range_original(&self, start: usize, end: usize) -> Option<String> {
|
||||
self.s.get_range_original(start..end).map(|s| s.to_owned())
|
||||
}
|
||||
}
|
@ -9,10 +9,62 @@ from typing import Optional, Union, List, Tuple
|
||||
|
||||
Offsets = Tuple[int, int]
|
||||
|
||||
class NormalizedString:
    """A NormalizedString produced during normalization"""

    @property
    def original(self) -> str:
        """The string as it was before normalization"""
        pass

    @property
    def normalized(self) -> str:
        """The string after normalization"""
        pass

    def get_range(self, start: int, end: int) -> Optional[str]:
        """Return a range of the normalized string, if the bounds are correct

        Args:
            start: int:
                The starting offset in the string

            end: int:
                The ending offset in the string

        Returns:
            The substring if the bounds are correct
        """
        pass

    def get_range_original(self, start: int, end: int) -> Optional[str]:
        """Return a range of the original string, if the bounds are correct

        The given bounds are supposed to be after-normalization-offsets.
        Provided with the `Encoding.offsets` associated with an `Encoding.ids` unit,
        this method will return the part of the original string corresponding to the id.

        Args:
            start: int:
                The starting offset in the normalized string

            end: int:
                The ending offset in the normalized string

        Returns:
            The substring if the bounds are correct
        """
        pass
|
||||
|
||||
|
||||
class Encoding:
|
||||
""" An Encoding as returned by the Tokenizer """
|
||||
|
||||
@property
|
||||
def normalized(self) -> NormalizedString:
|
||||
""" The NormalizedString """
|
||||
pass
|
||||
|
||||
@property
|
||||
def ids(self) -> List[int]:
|
||||
""" The tokenized ids """
|
||||
|
Reference in New Issue
Block a user