Created
February 17, 2023 16:29
-
-
Save pstch/81c28d81c9b4f4e04d6d55cc2c711067 to your computer and use it in GitHub Desktop.
compact serialization for mail-parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use serde::{Serialize, Deserialize}; | |
use std::borrow::Cow; | |
use mail_parser::{Message, MessagePart, MessagePartId, PartType, Header, Encoding as MimeEncoding, MimeHeaders}; | |
use mail_parser::decoders::{base64::base64_decode, quoted_printable::quoted_printable_decode}; | |
use encoding_rs::{Encoding as TextEncoding, UTF_8}; | |
// TODO: | |
// - add criterion benchmark, especially to compare deserialization+conversion | |
// to Message::parse | |
// - add error handling, especially in charset and MIME type handling | |
// - make Header fully owned at the type level, so that we can make CompactMessage | |
// self-contained (without any dynamic references) | |
// + currently, instances of our structures are always self-contained, because | |
// header is cloned when building CompactMessagePart, however this could be skipped | |
// using custom Header/HeaderName/HeaderValue types that can be converted from/to the
// not-always-owning original types | |
// - provide a feature to allow to reduce heap allocations using smallvec, | |
// so that we can check in real usage if it brings any improvements | |
// - add a test harness that allows testing with a large number of emails in parallel | |
// + to quickly test large quantities of mail | |
// + to simulate real-world conditions
// 0. Decoders | |
// =========================================================================== | |
fn charset_decode<'a>(data: Cow<'a, [u8]>, charset: &'static TextEncoding) -> Cow<'a, str> { | |
//charset.decode(&data).0 | |
Cow::from(charset.decode(&data).0.into_owned()) | |
} | |
fn mime_decode(data: Cow<[u8]>, encoding: MimeEncoding) -> Cow<[u8]> { | |
match encoding { | |
MimeEncoding::None => data, | |
MimeEncoding::Base64 => Cow::from(base64_decode(&data).unwrap()), | |
MimeEncoding::QuotedPrintable => Cow::from(quoted_printable_decode(&data).unwrap()), | |
} | |
} | |
// 1. Wrapping structures | |
// =========================================================================== | |
/// This structure is an analog to Message that does not contain | |
/// references to the message's body, useful to serialize the | |
/// message structure independently of its body. | |
/// | |
/// The only alteration are using CompactMessagePart rather than | |
/// MessagePart, and removing the raw_message field. | |
#[derive(Debug, PartialEq, Clone)] | |
#[derive(Serialize, Deserialize)] | |
pub struct CompactMessage<'a> { | |
pub html_body: Vec<MessagePartId>, | |
pub text_body: Vec<MessagePartId>, | |
pub attachments: Vec<MessagePartId>, | |
pub parts: Vec<CompactMessagePart<'a>>, | |
} | |
/// This structure is an analog to MessagePart that does not contain | |
/// references to the message's body, useful to serialize the | |
/// message structure independently of its body. | |
/// | |
/// The only alterations are adding a field for the text's charset, | |
/// making sure that the 'encoding' field is properly serialized, | |
/// and storing the part's type rather than a typed reference to its body. | |
#[derive(Debug, PartialEq, Clone)] | |
#[derive(Serialize, Deserialize)] | |
pub struct CompactMessagePart<'a> { | |
pub headers: Vec<Header<'a>>, | |
pub is_encoding_problem: bool, | |
pub part_type: CompactPartType<'a>, | |
pub charset: &'static TextEncoding, | |
pub encoding: MimeEncoding, | |
pub offset_header: usize, | |
pub offset_body: usize, | |
pub offset_end: usize, | |
} | |
/// This structure is an analog to PartType that does not contain | |
/// references to the message's body, useful to serialize the | |
/// message structure independently of its body. | |
/// | |
/// The only alterations are removing any values that directly | |
/// reference the source message body. | |
#[derive(Debug, PartialEq, Clone)] | |
#[derive(Serialize, Deserialize)] | |
pub enum CompactPartType<'a> { | |
Text, | |
Html, | |
Binary, | |
InlineBinary, | |
Message(CompactMessage<'a>), | |
Multipart(Vec<MessagePartId>), | |
} | |
// 2. Wrapping impls | |
// =========================================================================== | |
impl<'a, 'b> CompactMessage<'a> { | |
pub fn from_message(message: &'b Message<'b>) -> Self { | |
Self { | |
html_body: message.html_body.clone(), | |
text_body: message.text_body.clone(), | |
attachments: message.attachments.clone(), | |
parts: message.parts.iter().map(|x| | |
CompactMessagePart::from_message_part(x) | |
).collect(), | |
} | |
} | |
pub fn to_message(&'a self, raw_message: &'b [u8]) -> Message<'b> { | |
Message { | |
html_body: self.html_body.clone(), | |
text_body: self.text_body.clone(), | |
attachments: self.attachments.clone(), | |
parts: self.parts.iter().map(|x| | |
CompactMessagePart::to_message_part(x, raw_message.clone()) | |
).collect(), | |
raw_message: Cow::from(raw_message), | |
} | |
} | |
} | |
impl<'a, 'b> CompactMessagePart<'a> { | |
pub fn from_message_part(part: &'b MessagePart<'b>) -> Self { | |
Self { | |
headers: part.headers.clone().into_iter().map(|h| h.into_owned()).collect(), | |
is_encoding_problem: part.is_encoding_problem.clone(), | |
encoding: part.encoding.clone(), | |
part_type: CompactPartType::from_part_type(&part.body), | |
charset: TextEncoding::for_label(part.content_transfer_encoding().unwrap_or("").as_bytes()).unwrap_or(UTF_8), | |
offset_header: part.offset_header.clone(), | |
offset_body: part.offset_body.clone(), | |
offset_end: part.offset_end.clone(), | |
} | |
} | |
pub fn to_message_part(&self, raw_message: &'b [u8]) -> MessagePart<'b> { | |
let offsets = (self.offset_header, self.offset_body, self.offset_end); | |
MessagePart { | |
headers: self.headers.clone().into_iter().map(|h| h.into_owned()).collect(), | |
is_encoding_problem: self.is_encoding_problem, | |
encoding: self.encoding, | |
body: self.part_type.to_part_type(raw_message, offsets, self.encoding, self.charset), | |
offset_header: self.offset_header, | |
offset_body: self.offset_body, | |
offset_end: self.offset_end, | |
} | |
} | |
} | |
impl<'a, 'b> CompactPartType<'a> { | |
pub fn from_part_type(part_type: &'b PartType) -> Self { | |
match part_type { | |
PartType::Text(_) => Self::Text, | |
PartType::Html(_) => Self::Html, | |
PartType::Binary(_) => Self::Binary, | |
PartType::InlineBinary(_) => Self::InlineBinary, | |
PartType::Message(message) => Self::Message(CompactMessage::from_message(message)), | |
PartType::Multipart(parts) => Self::Multipart(parts.clone()), | |
} | |
} | |
pub fn to_part_type(&'a self, raw_message: &'b [u8], offsets: (usize, usize, usize), encoding: MimeEncoding, charset: &'static TextEncoding) -> PartType<'b> { | |
let (offset_header, offset_body, offset_end) = offsets; | |
let raw_slice = Cow::from(&raw_message[offset_body..offset_end]); | |
match self { | |
Self::Text => PartType::Text(charset_decode(mime_decode(raw_slice, encoding), charset)), | |
Self::Html => PartType::Html(charset_decode(mime_decode(raw_slice, encoding), charset)), | |
Self::Binary => PartType::Binary(mime_decode(raw_slice, encoding)), | |
Self::InlineBinary => PartType::InlineBinary(raw_slice), | |
Self::Message(message) => PartType::Message(CompactMessage::to_message(message, raw_message)), | |
Self::Multipart(parts) => PartType::Multipart(parts.clone()), | |
} | |
} | |
} | |
// 5. Entry points | |
// =========================================================================== | |
pub fn serialize_message_structure(message: &Message) -> Vec<u8> { | |
bincode::serialize(&CompactMessage::from_message(message)).unwrap() | |
} | |
pub fn deserialize_message_structure<'a, 'b>(data: &'a [u8], raw_message: &'b [u8]) -> Message<'b> { | |
CompactMessage::to_message(&bincode::deserialize(data).unwrap(), raw_message) | |
} | |
// 6. Tests and benchmarks
// =========================================================================== | |
// Round-trip test for the structure (de)serialization entry points.
// NOTE(review): this module is not gated behind `#[cfg(test)]` — confirm whether that is intended.
mod tests { | |
use mail_parser::Message; | |
// NOTE(review): this path assumes the file is mounted as `crate::message` — confirm.
use crate::message::{serialize_message_structure, deserialize_message_structure}; | |
// NOTE(review): `CompactMessage` is not referenced anywhere in this module.
use super::CompactMessage; | |
// Parses a sample multipart message, serializes its structure, rebuilds the
// message from that structure plus the same raw bytes, and expects the result
// to equal the freshly parsed message.
// NOTE(review): the fixture below looks altered in this copy (placeholder
// addresses, no blank lines between headers and bodies) — verify it against
// the intended sample before relying on this test.
#[test] | |
fn test_sample_message() { | |
let input = br#"From: Art Vandelay <[email protected]> (Vandelay Industries) | |
To: "Colleagues": "James Smythe" <[email protected]>; Friends: | |
[email protected], =?UTF-8?Q?John_Sm=C3=AEth?= <[email protected]>; | |
Date: Sat, 20 Nov 2021 14:22:01 -0800 | |
Subject: Why not both importing AND exporting? =?utf-8?b?4pi6?= | |
Content-Type: multipart/mixed; boundary="festivus"; | |
--festivus | |
Content-Type: text/html; charset="us-ascii" | |
Content-Transfer-Encoding: base64 | |
PGh0bWw+PHA+SSB3YXMgdGhpbmtpbmcgYWJvdXQgcXVpdHRpbmcgdGhlICZsZHF1bztle | |
HBvcnRpbmcmcmRxdW87IHRvIGZvY3VzIGp1c3Qgb24gdGhlICZsZHF1bztpbXBvcnRpbm | |
cmcmRxdW87LDwvcD48cD5idXQgdGhlbiBJIHRob3VnaHQsIHdoeSBub3QgZG8gYm90aD8 | |
gJiN4MjYzQTs8L3A+PC9odG1sPg== | |
--festivus | |
Content-Type: message/rfc822 | |
From: "Cosmo Kramer" <[email protected]> | |
Subject: Exporting my book about coffee tables | |
Content-Type: multipart/mixed; boundary="giddyup"; | |
--giddyup | |
Content-Type: text/plain; charset="utf-16" | |
Content-Transfer-Encoding: quoted-printable | |
=FF=FE=0C!5=D8"=DD5=D8)=DD5=D8-=DD =005=D8*=DD5=D8"=DD =005=D8"= | |
=DD5=D85=DD5=D8-=DD5=D8,=DD5=D8/=DD5=D81=DD =005=D8*=DD5=D86=DD = | |
=005=D8=1F=DD5=D8,=DD5=D8,=DD5=D8(=DD =005=D8-=DD5=D8)=DD5=D8"= | |
=DD5=D8=1E=DD5=D80=DD5=D8"=DD!=00 | |
--giddyup | |
Content-Type: image/gif; name*1="about "; name*0="Book "; | |
name*2*=utf-8''%e2%98%95 tables.gif | |
Content-Transfer-Encoding: Base64 | |
Content-Disposition: attachment | |
R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7 | |
--giddyup-- | |
--festivus-- | |
"#; | |
// Reference result: a full parse of the raw bytes.
let original = Message::parse(input).unwrap(); | |
// Only the structure is serialized; the raw bytes are supplied again on rebuild.
let serialized = serialize_message_structure(&original); | |
let message = deserialize_message_structure(&serialized, input); | |
// The rebuilt message must be indistinguishable from the parsed one.
assert_eq!(original, message); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment