Implement impl_serde_type macro

This commit is contained in:
Mishig Davaadorj
2022-01-22 22:53:02 +01:00
parent a8e07d734f
commit 9a9c70563a
12 changed files with 330 additions and 136 deletions

View File

@ -57,6 +57,8 @@ dirs = "3.0"
reqwest = { version = "0.11", optional = true } reqwest = { version = "0.11", optional = true }
cached-path = { version = "0.5", optional = true } cached-path = { version = "0.5", optional = true }
aho-corasick = "0.7" aho-corasick = "0.7"
paste = "1.0.6"
proc_macros = { path = "./src/utils/proc_macros" }
[features] [features]
default = ["progressbar", "http"] default = ["progressbar", "http"]

View File

@ -1,12 +1,13 @@
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use onig::Regex; use onig::Regex;
use serde::{Deserialize, Deserializer, Serialize}; use serde::{Deserialize, Serialize};
use crate::tokenizer::{ use crate::tokenizer::{
Decoder, Encoding, PostProcessor, PreTokenizedString, PreTokenizer, Result, Decoder, Encoding, PostProcessor, PreTokenizedString, PreTokenizer, Result,
SplitDelimiterBehavior, SplitDelimiterBehavior,
}; };
use crate::utils::macro_rules_attribute;
fn bytes_char() -> HashMap<u8, char> { fn bytes_char() -> HashMap<u8, char> {
let mut bs: Vec<u8> = vec![]; let mut bs: Vec<u8> = vec![];
@ -40,11 +41,11 @@ lazy_static! {
bytes_char().into_iter().map(|(c, b)| (b, c)).collect(); bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
} }
#[derive(Serialize, Copy, Clone, Debug, PartialEq)] #[derive(Copy, Clone, Debug, PartialEq)]
/// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care
/// of all the required processing steps to transform a UTF-8 string as needed before and after the /// of all the required processing steps to transform a UTF-8 string as needed before and after the
/// BPE model does its job. /// BPE model does its job.
#[serde(tag = "type")] #[macro_rules_attribute(impl_serde_type!)]
#[non_exhaustive] #[non_exhaustive]
pub struct ByteLevel { pub struct ByteLevel {
/// Whether to add a leading space to the first word. This allows to treat the leading word /// Whether to add a leading space to the first word. This allows to treat the leading word
@ -54,29 +55,6 @@ pub struct ByteLevel {
pub trim_offsets: bool, pub trim_offsets: bool,
} }
impl<'de> Deserialize<'de> for ByteLevel {
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
D: Deserializer<'de>,
{
#[derive(Deserialize)]
enum Type {
ByteLevel,
}
#[derive(Deserialize)]
pub struct ByteLevelHelper {
#[serde(rename = "type")]
_type: Type,
add_prefix_space: bool,
trim_offsets: bool,
}
let helper = ByteLevelHelper::deserialize(deserializer)?;
Ok(ByteLevel::new(helper.add_prefix_space, helper.trim_offsets))
}
}
impl Default for ByteLevel { impl Default for ByteLevel {
fn default() -> Self { fn default() -> Self {
Self { Self {

View File

@ -1,36 +1,15 @@
use serde::{Deserialize, Deserializer, Serialize}; use serde::{Deserialize, Serialize};
use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
use crate::utils::macro_rules_attribute;
#[derive(Copy, Clone, Debug, Serialize, PartialEq)] #[derive(Copy, Clone, Debug, PartialEq)]
#[serde(tag = "type")]
#[non_exhaustive] #[non_exhaustive]
#[macro_rules_attribute(impl_serde_type!)]
pub struct CharDelimiterSplit { pub struct CharDelimiterSplit {
pub delimiter: char, pub delimiter: char,
} }
impl<'de> Deserialize<'de> for CharDelimiterSplit {
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
D: Deserializer<'de>,
{
#[derive(Deserialize)]
enum Type {
CharDelimiterSplit,
}
#[derive(Deserialize)]
pub struct CharDelimiterSplitHelper {
#[serde(rename = "type")]
_type: Type,
delimiter: char,
}
let helper = CharDelimiterSplitHelper::deserialize(deserializer)?;
Ok(CharDelimiterSplit::new(helper.delimiter))
}
}
impl CharDelimiterSplit { impl CharDelimiterSplit {
pub fn new(delimiter: char) -> Self { pub fn new(delimiter: char) -> Self {
CharDelimiterSplit { delimiter } CharDelimiterSplit { delimiter }

View File

@ -1,38 +1,17 @@
use serde::{Deserialize, Deserializer, Serialize}; use serde::{Deserialize, Serialize};
use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
use crate::utils::macro_rules_attribute;
#[derive(Serialize, Clone, Debug, PartialEq)] #[derive(Clone, Debug, PartialEq)]
/// Pre tokenizes the numbers into single tokens. If individual_digits is set /// Pre tokenizes the numbers into single tokens. If individual_digits is set
/// to true, then all digits are splitted into individual tokens. /// to true, then all digits are splitted into individual tokens.
#[serde(tag = "type")]
#[non_exhaustive] #[non_exhaustive]
#[macro_rules_attribute(impl_serde_type!)]
pub struct Digits { pub struct Digits {
pub individual_digits: bool, pub individual_digits: bool,
} }
impl<'de> Deserialize<'de> for Digits {
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
D: Deserializer<'de>,
{
#[derive(Deserialize)]
enum Type {
Digits,
}
#[derive(Deserialize)]
pub struct DigitsHelper {
#[serde(rename = "type")]
_type: Type,
individual_digits: bool,
}
let helper = DigitsHelper::deserialize(deserializer)?;
Ok(Digits::new(helper.individual_digits))
}
}
impl Digits { impl Digits {
pub fn new(individual_digits: bool) -> Self { pub fn new(individual_digits: bool) -> Self {
Self { individual_digits } Self { individual_digits }

View File

@ -24,13 +24,11 @@ impl<'de> Deserialize<'de> for Metaspace {
} }
#[derive(Deserialize)] #[derive(Deserialize)]
pub struct MetaspaceHelper { struct MetaspaceHelper {
#[serde(rename = "type")] #[allow(dead_code)]
_type: Type, r#type: Type,
replacement: char, replacement: char,
pub add_prefix_space: bool, add_prefix_space: bool,
#[serde(skip, rename = "str_rep")]
_str_rep: String,
} }
let helper = MetaspaceHelper::deserialize(deserializer)?; let helper = MetaspaceHelper::deserialize(deserializer)?;

View File

@ -1,41 +1,20 @@
use serde::{Deserialize, Deserializer, Serialize}; use serde::{Deserialize, Serialize};
use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
use crate::utils::macro_rules_attribute;
use unicode_categories::UnicodeCategories; use unicode_categories::UnicodeCategories;
fn is_punc(x: char) -> bool { fn is_punc(x: char) -> bool {
char::is_ascii_punctuation(&x) || x.is_punctuation() char::is_ascii_punctuation(&x) || x.is_punctuation()
} }
#[derive(Serialize, Copy, Clone, Debug, PartialEq)] #[derive(Copy, Clone, Debug, PartialEq)]
#[serde(tag = "type")] #[macro_rules_attribute(impl_serde_type!)]
pub struct Punctuation { pub struct Punctuation {
#[serde(default = "default_split")]
behavior: SplitDelimiterBehavior, behavior: SplitDelimiterBehavior,
} }
impl<'de> Deserialize<'de> for Punctuation {
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
D: Deserializer<'de>,
{
#[derive(Deserialize)]
enum Type {
Punctuation,
}
#[derive(Deserialize)]
pub struct PunctuationHelper {
#[serde(rename = "type")]
_type: Type,
#[serde(default = "default_split")]
behavior: SplitDelimiterBehavior,
}
let helper = PunctuationHelper::deserialize(deserializer)?;
Ok(Punctuation::new(helper.behavior))
}
}
fn default_split() -> SplitDelimiterBehavior { fn default_split() -> SplitDelimiterBehavior {
SplitDelimiterBehavior::Isolated SplitDelimiterBehavior::Isolated
} }

View File

@ -1,35 +1,14 @@
use crate::pre_tokenizers::PreTokenizerWrapper; use crate::pre_tokenizers::PreTokenizerWrapper;
use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result}; use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result};
use serde::{Deserialize, Deserializer, Serialize}; use crate::utils::macro_rules_attribute;
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Serialize, PartialEq)] #[derive(Clone, Debug, PartialEq)]
#[serde(tag = "type")] #[macro_rules_attribute(impl_serde_type!)]
pub struct Sequence { pub struct Sequence {
pretokenizers: Vec<PreTokenizerWrapper>, pretokenizers: Vec<PreTokenizerWrapper>,
} }
impl<'de> Deserialize<'de> for Sequence {
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
D: Deserializer<'de>,
{
#[derive(Deserialize)]
enum Type {
Sequence,
}
#[derive(Deserialize)]
pub struct SequenceHelper {
#[serde(rename = "type")]
_type: Type,
pretokenizers: Vec<PreTokenizerWrapper>,
}
let helper = SequenceHelper::deserialize(deserializer)?;
Ok(Sequence::new(helper.pretokenizers))
}
}
impl Sequence { impl Sequence {
pub fn new(pretokenizers: Vec<PreTokenizerWrapper>) -> Self { pub fn new(pretokenizers: Vec<PreTokenizerWrapper>) -> Self {
Self { pretokenizers } Self { pretokenizers }

View File

@ -45,9 +45,9 @@ impl<'de> Deserialize<'de> for Split {
} }
#[derive(Deserialize)] #[derive(Deserialize)]
pub struct SplitHelper { struct SplitHelper {
#[serde(rename = "type")] #[allow(dead_code)]
_type: Type, r#type: Type,
pattern: SplitPattern, pattern: SplitPattern,
behavior: SplitDelimiterBehavior, behavior: SplitDelimiterBehavior,
invert: bool, invert: bool,

View File

@ -74,3 +74,110 @@ macro_rules! impl_serde_unit_struct (
} }
} }
); );
/// Implement `serde::Serialize` and `serde::Deserialize` for a struct, tagging it with a
/// `type` field whose value is the struct name (i.e. `#[serde(tag = "type")]`).
///
/// Deserialization goes through a hidden helper struct that has a mandatory `type`
/// field, so input that lacks `type` is rejected with a deserialization error. The second
/// example below panics only because it `unwrap`s that error.
///
/// The expansion derives `Serialize` and `Deserialize` by their bare names and invokes
/// `::paste::paste!`, so the calling module must have both serde traits in scope and the
/// calling crate must depend on `paste`.
///
/// The macro accepts a struct with named fields, with or without a trailing comma after
/// the last field. Attributes on the struct and on its fields are forwarded as-is.
///
/// # Examples
///
/// ```
/// # #[macro_use] extern crate tokenizers;
/// use serde::{Serialize, Deserialize};
///
/// fn main() {
///     impl_serde_type!{
///         #[derive(Debug)]
///         struct Point {
///             x: i32,
///             #[serde(default = "default_y")]
///             y: i32,
///         }
///     }
///     fn default_y() -> i32 {
///         5
///     }
///
///     let point = Point { x: 1, y: 2 };
///     let serialized_s = r#"{"type":"Point","x":1,"y":2}"#;
///     assert_eq!(serde_json::to_string(&point).unwrap(), serialized_s);
/// }
/// ```
///
/// ```should_panic
/// # #[macro_use] extern crate tokenizers;
/// use serde::{Serialize, Deserialize};
///
/// fn main() {
///     impl_serde_type!{
///         #[derive(Debug)]
///         struct Point1D {
///             x: i32,
///         }
///     }
///
///     let serialized_s = r#"{"x":1}"#;
///     let deserialized: Point1D = serde_json::from_str(serialized_s).unwrap();
/// }
/// ```
#[macro_export]
macro_rules! impl_serde_type {
    (
        $(#[$meta:meta])*
        $vis:vis struct $struct_name:ident {
            $(
                $(#[$field_meta:meta])*
                $field_vis:vis $field_name:ident : $field_type:ty
            ),* $(,)?
        }
    ) => {
        // `paste!` is invoked by path rather than through an emitted `use paste::paste;`:
        // a `use` item would land in the caller's module and clash as soon as the macro
        // is applied to a second struct in that same module.
        ::paste::paste! {
            // The struct itself: serialized with the `type` tag, deserialized by
            // converting from the hidden `<Name>Deserializer` helper below.
            $(#[$meta])*
            #[derive(Serialize, Deserialize)]
            #[serde(tag = "type", from = $struct_name "Deserializer")]
            $vis struct $struct_name {
                $(
                    $(#[$field_meta])*
                    $field_vis $field_name : $field_type,
                )*
            }

            // Field-for-field mirror of the struct, used as its serde `remote`
            // definition so the helper can deserialize the fields without recursing
            // into the `from = ...` conversion declared on the struct itself.
            #[doc(hidden)]
            $(#[$meta])*
            #[derive(Deserialize)]
            #[serde(tag = "type", remote = $struct_name "")]
            struct [<$struct_name Def>] {
                $(
                    $(#[$field_meta])*
                    $field_vis $field_name : $field_type,
                )*
            }

            // Single-variant enum: the only value accepted for the `type` field is
            // the struct's own name.
            #[doc(hidden)]
            #[derive(Deserialize)]
            enum [<$struct_name Type>] {
                $struct_name,
            }

            // Helper actually read from the input: a mandatory `type` field plus the
            // flattened fields of the struct.
            #[doc(hidden)]
            #[derive(Deserialize)]
            struct [<$struct_name Deserializer>] {
                // Only deserialized to validate the tag; never read afterwards.
                #[allow(dead_code)]
                r#type: [<$struct_name Type>],
                #[serde(flatten, with = $struct_name "Def")]
                r#struct: $struct_name,
            }

            #[doc(hidden)]
            impl std::convert::From<[<$struct_name Deserializer>]> for $struct_name {
                fn from(v: [<$struct_name Deserializer>]) -> Self {
                    v.r#struct
                }
            }
        }
    };
}
// Re-export macro_rules_attribute
pub use proc_macros::macro_rules_attribute;

View File

@ -0,0 +1,11 @@
[package]
name = "proc_macros"
version = "0.1.0"
edition = "2018"
[lib]
proc-macro = true
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

View File

@ -0,0 +1 @@
TODO here

View File

@ -0,0 +1,181 @@
//! Do not use this crate directly. Instead, use [`::macro_rules_attribute`](
//! https://docs.rs/macro_rules_attribute)
extern crate proc_macro;
use ::proc_macro::*;
/// Applies the given `macro_rules!` macro to the decorated item.
///
/// This, as with any `proc_macro_attribute`, **consumes** the item it
/// decorates: it is the `macro_rules!` macro job to generate it (_it is thus
/// able to modify it_!).
///
/// For a version with "read-only" access to the item it decorates, see
/// [`macro_rules_derive`][`macro@macro_rules_derive`].
///
/// The attribute argument must be written as `macro_name!` (a macro path
/// followed by `!`). The expansion is that argument followed by the decorated
/// item wrapped in braces, i.e. `macro_name! { <item> }`.
///
/// # Panics
///
/// Panics (which surfaces as a compile error at the attribute site) when the
/// argument does not end with `!`, or consists of the `!` alone.
///
/// # Example
///
/// Deriving getters for a (non-generic) `struct`:
///
/// ```rust
/// # macro_rules! ignore {($($tt:tt)*) => () }
/// # ignore! {
/// #[macro_use]
/// extern crate macro_rules_attribute;
/// # }
///
/// macro_rules! make_getters {(
///     $(#[$struct_meta:meta])*
///     $struct_vis:vis
///     struct $StructName:ident {
///         $(
///             $(#[$field_meta:meta])*
///             $field_vis:vis // this visibility will be applied to the getters instead
///             $field_name:ident : $field_ty:ty
///         ),* $(,)?
///     }
/// ) => (
///     // First, generate the struct definition we have been given, but with
///     // private fields instead.
///     $(#[$struct_meta])*
///     $struct_vis
///     struct $StructName {
///         $(
///             $(#[$field_meta])*
///             // notice the lack of visibility => private fields
///             $field_name: $field_ty,
///         )*
///     }
///
///     // Then, implement the getters:
///     impl $StructName {
///         $(
///             #[inline]
///             $field_vis
///             fn $field_name (self: &'_ Self)
///                 -> &'_ $field_ty
///             {
///                 &self.$field_name
///             }
///         )*
///     }
/// )}
///
/// mod example {
/// # use ::proc_macros::macro_rules_attribute;
///     #[macro_rules_attribute(make_getters!)]
///     /// The macro handles meta attributes such as docstrings
///     pub
///     struct Person {
///         pub
///         name: String,
///
///         pub
///         age: u8,
///     }
/// }
/// use example::Person;
///
/// fn is_new_born (person: &'_ Person)
///     -> bool
/// {
///     // person.age == 0
///     // ^ error[E0616]: field `age` of struct `example::Person` is private
///     *person.age() == 0
/// }
/// ```
#[proc_macro_attribute]
pub fn macro_rules_attribute(attrs: TokenStream, input: TokenStream) -> TokenStream {
    // Check that `attrs` looks like `$macro_name:path !`: it must end with `!`
    // and have at least one token in front of it. A bare `!` used to slip
    // through and expand to `! { ... }`, which fails far from the real mistake.
    // FIXME: the tokens before the `!` are still not validated as a path.
    let attr_tokens: Vec<TokenTree> = attrs.clone().into_iter().collect();
    let well_formed = match attr_tokens.split_last() {
        Some((TokenTree::Punct(punct), macro_path)) => {
            punct.as_char() == '!' && !macro_path.is_empty()
        }
        _ => false,
    };
    if !well_formed {
        panic!("Expected a parameter of the form `macro_name !`");
    }

    // Build `macro_name ! { <item tokens> }`.
    let braced_item = TokenTree::Group(Group::new(
        Delimiter::Brace,
        // FIXME: directly using `input` makes the token stream be seen
        // as a single token tree by the declarative macro !??
        input.into_iter().collect(),
    ));
    let mut invocation = attrs;
    invocation.extend(::std::iter::once(braced_item));

    // Optional debugging aid: print the generated invocation at expansion time.
    #[cfg(feature = "verbose-expansions")]
    eprintln!("{}", invocation);

    invocation
}
/// Applies the given `macro_rules!` macro to the decorated item.
///
/// This, as with any `#[derive(...)]`, **does not consume** the item it
/// decorates: instead, it only generates code on top of it.
///
/// Concretely, the output is the decorated item, unchanged, followed by
/// whatever `macro_rules_attribute` produces for the same argument and item
/// (so the same `macro_name!` argument form is required).
///
/// # Example
///
/// Implementing `Into<Int>` for a given `#[repr(Int)]` `enum`:
///
/// ```rust
/// # macro_rules! ignore {($($tt:tt)*) => () }
/// # ignore! {
/// #[macro_use]
/// extern crate macro_rules_attribute;
/// # }
///
/// macro_rules! ToInteger {(
///     #[repr($Int:ident)]
///     $(#[$enum_meta:meta])*
///     $pub:vis
///     enum $Enum:ident {
///         $(
///             $Variant:ident $(= $value:expr)?
///         ),* $(,)?
///     }
/// ) => (
///     impl ::core::convert::From<$Enum> for $Int {
///         #[inline]
///         fn from (x: $Enum)
///             -> Self
///         {
///             x as _
///         }
///     }
/// )}
///
/// # use ::proc_macros::macro_rules_derive;
/// #[macro_rules_derive(ToInteger!)]
/// #[repr(u32)]
/// enum Bool {
///     False,
///     True,
/// }
///
/// fn main ()
/// {
///     assert_eq!(u32::from(Bool::False), 0);
///     assert_eq!(u32::from(Bool::True), 1);
///     // assert_eq!(u8::from(Bool::False), 0);
///     // ^ error[E0277]: the trait bound `u8: std::convert::From<main::Bool>` is not satisfied
/// }
/// ```
#[proc_macro_attribute]
pub fn macro_rules_derive(attrs: TokenStream, input: TokenStream) -> TokenStream {
    // Keep a copy of the item: `macro_rules_attribute` takes `input` by value,
    // and the original item must still be emitted first.
    let mut expanded = input.clone();
    // Append the macro invocation built from the same item.
    expanded.extend(macro_rules_attribute(attrs, input));
    expanded
}