use serde::{Serialize, Deserialize}; use std::borrow::Cow; use mail_parser::{Message, MessagePart, MessagePartId, PartType, Header, Encoding as MimeEncoding, MimeHeaders}; use mail_parser::decoders::{base64::base64_decode, quoted_printable::quoted_printable_decode}; use encoding_rs::{Encoding as TextEncoding, UTF_8}; // TODO: // - add criterion benchmark, especially to compare deserialization+conversion // to Message::parse // - add error handling, especially in charset and MIME type handling // - make Header fully owned at the type level, so that we can make CompactMessage // self-contained (without any dynamic references) // + currently, instances of our structures are always self-contained, because // header is cloned when building CompactMessagePart, however this could be skipped // using custom Header/HeaderName/HeaderValue types that can converted from/to the // not-always-owning original types // - provide a feature to allow to reduce heap allocations using smallvec, // so that we can check in real usage if it brings any improvements // - add a test harness that allows testing with a large number of emails in parallel // + to quickly test large quantities of mail // + to simulate real word conditions // 0. Decoders // =========================================================================== fn charset_decode<'a>(data: Cow<'a, [u8]>, charset: &'static TextEncoding) -> Cow<'a, str> { //charset.decode(&data).0 Cow::from(charset.decode(&data).0.into_owned()) } fn mime_decode(data: Cow<[u8]>, encoding: MimeEncoding) -> Cow<[u8]> { match encoding { MimeEncoding::None => data, MimeEncoding::Base64 => Cow::from(base64_decode(&data).unwrap()), MimeEncoding::QuotedPrintable => Cow::from(quoted_printable_decode(&data).unwrap()), } } // 1. Wrapping structures // =========================================================================== /// This structure is an analog to Message that does not contain /// references to the message's body, useful to serialize the /// message structure independently of its body. /// /// The only alteration are using CompactMessagePart rather than /// MessagePart, and removing the raw_message field. #[derive(Debug, PartialEq, Clone)] #[derive(Serialize, Deserialize)] pub struct CompactMessage<'a> { pub html_body: Vec<MessagePartId>, pub text_body: Vec<MessagePartId>, pub attachments: Vec<MessagePartId>, pub parts: Vec<CompactMessagePart<'a>>, } /// This structure is an analog to MessagePart that does not contain /// references to the message's body, useful to serialize the /// message structure independently of its body. /// /// The only alterations are adding a field for the text's charset, /// making sure that the 'encoding' field is properly serialized, /// and storing the part's type rather than a typed reference to its body. #[derive(Debug, PartialEq, Clone)] #[derive(Serialize, Deserialize)] pub struct CompactMessagePart<'a> { pub headers: Vec<Header<'a>>, pub is_encoding_problem: bool, pub part_type: CompactPartType<'a>, pub charset: &'static TextEncoding, pub encoding: MimeEncoding, pub offset_header: usize, pub offset_body: usize, pub offset_end: usize, } /// This structure is an analog to PartType that does not contain /// references to the message's body, useful to serialize the /// message structure independently of its body. /// /// The only alterations are removing any values that directly /// reference the source message body. #[derive(Debug, PartialEq, Clone)] #[derive(Serialize, Deserialize)] pub enum CompactPartType<'a> { Text, Html, Binary, InlineBinary, Message(CompactMessage<'a>), Multipart(Vec<MessagePartId>), } // 2. Wrapping impls // =========================================================================== impl<'a, 'b> CompactMessage<'a> { pub fn from_message(message: &'b Message<'b>) -> Self { Self { html_body: message.html_body.clone(), text_body: message.text_body.clone(), attachments: message.attachments.clone(), parts: message.parts.iter().map(|x| CompactMessagePart::from_message_part(x) ).collect(), } } pub fn to_message(&'a self, raw_message: &'b [u8]) -> Message<'b> { Message { html_body: self.html_body.clone(), text_body: self.text_body.clone(), attachments: self.attachments.clone(), parts: self.parts.iter().map(|x| CompactMessagePart::to_message_part(x, raw_message.clone()) ).collect(), raw_message: Cow::from(raw_message), } } } impl<'a, 'b> CompactMessagePart<'a> { pub fn from_message_part(part: &'b MessagePart<'b>) -> Self { Self { headers: part.headers.clone().into_iter().map(|h| h.into_owned()).collect(), is_encoding_problem: part.is_encoding_problem.clone(), encoding: part.encoding.clone(), part_type: CompactPartType::from_part_type(&part.body), charset: TextEncoding::for_label(part.content_transfer_encoding().unwrap_or("").as_bytes()).unwrap_or(UTF_8), offset_header: part.offset_header.clone(), offset_body: part.offset_body.clone(), offset_end: part.offset_end.clone(), } } pub fn to_message_part(&self, raw_message: &'b [u8]) -> MessagePart<'b> { let offsets = (self.offset_header, self.offset_body, self.offset_end); MessagePart { headers: self.headers.clone().into_iter().map(|h| h.into_owned()).collect(), is_encoding_problem: self.is_encoding_problem, encoding: self.encoding, body: self.part_type.to_part_type(raw_message, offsets, self.encoding, self.charset), offset_header: self.offset_header, offset_body: self.offset_body, offset_end: self.offset_end, } } } impl<'a, 'b> CompactPartType<'a> { pub fn from_part_type(part_type: &'b PartType) -> Self { match part_type { PartType::Text(_) => Self::Text, PartType::Html(_) => Self::Html, PartType::Binary(_) => Self::Binary, PartType::InlineBinary(_) => Self::InlineBinary, PartType::Message(message) => Self::Message(CompactMessage::from_message(message)), PartType::Multipart(parts) => Self::Multipart(parts.clone()), } } pub fn to_part_type(&'a self, raw_message: &'b [u8], offsets: (usize, usize, usize), encoding: MimeEncoding, charset: &'static TextEncoding) -> PartType<'b> { let (offset_header, offset_body, offset_end) = offsets; let raw_slice = Cow::from(&raw_message[offset_body..offset_end]); match self { Self::Text => PartType::Text(charset_decode(mime_decode(raw_slice, encoding), charset)), Self::Html => PartType::Html(charset_decode(mime_decode(raw_slice, encoding), charset)), Self::Binary => PartType::Binary(mime_decode(raw_slice, encoding)), Self::InlineBinary => PartType::InlineBinary(raw_slice), Self::Message(message) => PartType::Message(CompactMessage::to_message(message, raw_message)), Self::Multipart(parts) => PartType::Multipart(parts.clone()), } } } // 5. Entry points // =========================================================================== pub fn serialize_message_structure(message: &Message) -> Vec<u8> { bincode::serialize(&CompactMessage::from_message(message)).unwrap() } pub fn deserialize_message_structure<'a, 'b>(data: &'a [u8], raw_message: &'b [u8]) -> Message<'b> { CompactMessage::to_message(&bincode::deserialize(data).unwrap(), raw_message) } // 5. Tests and benchmarks // =========================================================================== mod tests { use mail_parser::Message; use crate::message::{serialize_message_structure, deserialize_message_structure}; use super::CompactMessage; #[test] fn test_sample_message() { let input = br#"From: Art Vandelay <art@vandelay.com> (Vandelay Industries) To: "Colleagues": "James Smythe" <james@vandelay.com>; Friends: jane@example.com, =?UTF-8?Q?John_Sm=C3=AEth?= <john@example.com>; Date: Sat, 20 Nov 2021 14:22:01 -0800 Subject: Why not both importing AND exporting? =?utf-8?b?4pi6?= Content-Type: multipart/mixed; boundary="festivus"; --festivus Content-Type: text/html; charset="us-ascii" Content-Transfer-Encoding: base64 PGh0bWw+PHA+SSB3YXMgdGhpbmtpbmcgYWJvdXQgcXVpdHRpbmcgdGhlICZsZHF1bztle HBvcnRpbmcmcmRxdW87IHRvIGZvY3VzIGp1c3Qgb24gdGhlICZsZHF1bztpbXBvcnRpbm cmcmRxdW87LDwvcD48cD5idXQgdGhlbiBJIHRob3VnaHQsIHdoeSBub3QgZG8gYm90aD8 gJiN4MjYzQTs8L3A+PC9odG1sPg== --festivus Content-Type: message/rfc822 From: "Cosmo Kramer" <kramer@kramerica.com> Subject: Exporting my book about coffee tables Content-Type: multipart/mixed; boundary="giddyup"; --giddyup Content-Type: text/plain; charset="utf-16" Content-Transfer-Encoding: quoted-printable =FF=FE=0C!5=D8"=DD5=D8)=DD5=D8-=DD =005=D8*=DD5=D8"=DD =005=D8"= =DD5=D85=DD5=D8-=DD5=D8,=DD5=D8/=DD5=D81=DD =005=D8*=DD5=D86=DD = =005=D8=1F=DD5=D8,=DD5=D8,=DD5=D8(=DD =005=D8-=DD5=D8)=DD5=D8"= =DD5=D8=1E=DD5=D80=DD5=D8"=DD!=00 --giddyup Content-Type: image/gif; name*1="about "; name*0="Book "; name*2*=utf-8''%e2%98%95 tables.gif Content-Transfer-Encoding: Base64 Content-Disposition: attachment R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7 --giddyup-- --festivus-- "#; let original = Message::parse(input).unwrap(); let serialized = serialize_message_structure(&original); let message = deserialize_message_structure(&serialized, input); assert_eq!(original, message); } }