Skip to content

Instantly share code, notes, and snippets.

@mryhryki
Last active July 28, 2022 07:14
Show Gist options
  • Save mryhryki/13775c8b8cdc564589d66343473cdb83 to your computer and use it in GitHub Desktop.
Save mryhryki/13775c8b8cdc564589d66343473cdb83 to your computer and use it in GitHub Desktop.
Convert mbox to eml files by Node.js

Convert mbox to eml files by Node.js

Setup

  1. Install Node.js
  2. Run npm install
  3. Copy the mbox file into this directory.
    • $ cp /path/to/your/mbox/file ./

Execute

# Define your mbox file path and output directory path
$ export MBOX_FILE="/path/to/your/mbox/file"
$ export OUTPUT_DIR="/path/to/output/directory"

$ npm start
package-lock.json
node_modules/**
index.js
/** Size limit in bytes (100 * 1024 * 1024) applied to buffered message data. */
export const EmlFileMaxSize = 1024 ** 2 * 100;
/** Marker searched for in the mbox data: a line feed immediately followed by "From ". */
export const FromPattern = "\nFrom ";
/** Returns the value of the `MBOX_FILE` environment variable, as resolved by `getEnv`. */
export const getMboxFile = (): string => {
  return getEnv("MBOX_FILE");
};
/** Returns the value of the `OUTPUT_DIR` environment variable, as resolved by `getEnv`. */
export const getOutputDir = (): string => {
  return getEnv("OUTPUT_DIR");
};
/**
 * Reads an environment variable that must be present.
 *
 * @param name - Name of the environment variable to look up.
 * @returns The variable's value (an empty string counts as set).
 * @throws Error when the variable is not defined.
 */
const getEnv = (name: string): string => {
  const raw = process.env[name];
  if (raw == null) {
    throw new Error(`"${name}" environment variable is not set`);
  }
  return raw;
};
import { ParsedMail, simpleParser } from "mailparser";
import { getAddressText } from "./util";
/**
 * Parses the raw bytes of a single email message with mailparser.
 *
 * Uses the promise form of `simpleParser` instead of hand-wrapping the
 * callback form, and rejects with a real `Error` (the previous version
 * rejected with a plain string, which carries no stack trace).
 *
 * @param emlData - Raw message bytes.
 * @returns The parsed mail.
 * @throws Error with the message `Error parsing email: <cause>` when parsing fails.
 */
export const parseEmlData = async (emlData: Buffer): Promise<ParsedMail> => {
  try {
    return await simpleParser(emlData, {/* options */ });
  } catch (err) {
    throw new Error(`Error parsing email: ${String(err)}`);
  }
};
/**
 * Serializes the interesting parts of a parsed mail as pretty-printed JSON
 * (2-space indent).
 *
 * - `headers` maps each header key to its raw line; when a key occurs more
 *   than once the last occurrence wins.
 * - `attachments` keeps each attachment's fields but drops `content`
 *   (set to `undefined`, which `JSON.stringify` omits).
 * - `html` and `text` are split into arrays of lines.
 *
 * @param mail - Mail produced by mailparser.
 * @returns JSON text describing the mail.
 */
export const toJsonText = (mail: ParsedMail): string => {
  // The mailparser typings declare `html` as `string | false`. `??` does not
  // catch `false`, so the old `String(mail.html ?? "")` turned a missing HTML
  // body into the literal text "false".
  const html = typeof mail.html === "string" ? mail.html : "";
  const text = String(mail.text ?? "");

  // Built in one pass; the previous spread-in-reduce copied the accumulator
  // for every header line.
  const headers = Object.fromEntries(mail.headerLines.map(({ key, line }) => [key, line]));
  const attachments = (mail.attachments ?? []).map((attachment) => ({ ...attachment, content: undefined }));

  return JSON.stringify({
    messageId: mail.messageId,
    date: mail.date,
    subject: mail.subject,
    to: getAddressText(mail.to),
    from: mail.from?.value ?? null,
    cc: getAddressText(mail.cc),
    replyTo: mail.replyTo?.value ?? null,
    headers,
    attachments,
    html: html.split(/\r?\n/g),
    text: text.split(/\r?\n/g),
  }, null, 2);
};
import fsPromises from "node:fs/promises";
import path from "node:path";
/**
 * Creates a directory, including any missing parent directories.
 *
 * @param dirPath - Directory to create.
 */
export const makeDirectory = async (dirPath: string): Promise<void> => {
  const options = { recursive: true };
  await fsPromises.mkdir(dirPath, options);
};
/**
 * Reports whether `stat` succeeds for the given path.
 *
 * @param filePath - Path to probe.
 * @returns `true` when `stat` resolves, `false` when it rejects for any reason.
 */
export const existsFile = async (filePath: string): Promise<boolean> => {
  try {
    await fsPromises.stat(filePath);
    return true;
  } catch {
    return false;
  }
};
/**
 * Writes data to a file, creating the file's parent directories first.
 *
 * @param filePath - Destination file path.
 * @param data - Content to write.
 */
export const writeFile = async (filePath: string, data: string | Buffer): Promise<void> => {
  const parentDir = path.dirname(filePath);
  // Same recursive mkdir that makeDirectory performs.
  await fsPromises.mkdir(parentDir, { recursive: true });
  await fsPromises.writeFile(filePath, data);
};
import fs from "node:fs";
import path from "node:path";
import { getOutputFilePath, digestSha256, toDisplay } from "./util";
import { EmlFileMaxSize, FromPattern, getMboxFile, getOutputDir } from "./config";
import { existsFile, writeFile } from "./fs";
import { parseEmlData, toJsonText } from "./email";
/**
 * Stores one message extracted from the mbox file.
 *
 * The message's SHA-256 digest names an empty marker file under
 * `<baseDirPath>/exists/<first 2 hex chars>/<remaining hex chars>`. When that
 * marker already exists the message is skipped; otherwise the raw message is
 * written under `mail/`, its JSON summary under `json/`, and finally the
 * marker is created.
 *
 * @param eml - Raw bytes of a single message.
 * @param baseDirPath - Root output directory.
 */
const onDetectionEml = async (eml: Buffer, baseDirPath: string): Promise<void> => {
  const sha256 = digestSha256(eml);
  const checkPath = path.resolve(baseDirPath, "exists", sha256.slice(0, 2), sha256.slice(2));
  if (await existsFile(checkPath)) {
    return;
  }

  const mail = await parseEmlData(eml);
  // Compute the relative path once: for a mail without a date the path
  // presumably depends on the current time, so computing it separately for
  // the .eml and the .json could yield two different names for one message.
  const relativePath = getOutputFilePath(mail);
  await writeFile(`${path.resolve(baseDirPath, "mail", relativePath)}.eml`, eml);
  await writeFile(`${path.resolve(baseDirPath, "json", relativePath)}.json`, toJsonText(mail));
  // The marker is written last so an interrupted run retries this message.
  await writeFile(checkPath, "");
};
/**
 * Streams the mbox file, splits it at every "\nFrom " boundary and hands each
 * message to `onDetectionEml`.
 *
 * Changes from the previous event-handler version:
 * - Each message is awaited before the next chunk is read, so at most one
 *   message is being parsed and written at a time, failures propagate to the
 *   caller, and the final summary is printed only after all writes finished.
 * - Errors (including the size-limit error) reject the returned promise
 *   instead of being thrown from inside a stream event handler.
 * - The progress line reports the file offset of the current boundary
 *   (`readBytes` already includes the whole current chunk).
 * - An empty trailing buffer (e.g. an empty mbox file) is not treated as a
 *   message.
 *
 * @throws Error when more than `EmlFileMaxSize` bytes are buffered.
 */
const main = async (): Promise<void> => {
  const mboxFilePath = getMboxFile();
  const outputDir = getOutputDir();
  const stream = fs.createReadStream(mboxFilePath, { start: 0 });

  let readBytes = 0;
  let emlCount = 0;
  let minEmlSize = Number.MAX_VALUE;
  // Number.MIN_VALUE is the smallest positive double, not a "lowest" value;
  // sizes are never negative, so 0 is the correct seed.
  let maxEmlSize = 0;
  let buf: Buffer = Buffer.from([]);

  for await (const data of stream) {
    const chunk = data as Buffer | string;
    readBytes += chunk.length;
    buf = Buffer.concat([buf, Buffer.from(chunk)]);
    if (buf.length > EmlFileMaxSize) throw new Error(`Eml size too large: >= ${buf.length} Bytes`);

    while (true) {
      const fromPosition = buf.indexOf(FromPattern);
      if (fromPosition === -1) break;

      if (++emlCount % 1000 === 0) {
        // `buf` ends at file offset `readBytes`, so it starts at readBytes - buf.length.
        const consumedBytes = readBytes - buf.length + fromPosition;
        const bytes = toDisplay(consumedBytes, 15);
        const files = toDisplay(emlCount, 8);
        const average = toDisplay(Math.round(consumedBytes / emlCount), 11);
        console.log(`Read: ${files} files, ${bytes} bytes, ${average} bytes/file (average)`);
      }
      if (fromPosition < minEmlSize) minEmlSize = fromPosition;
      if (fromPosition > maxEmlSize) maxEmlSize = fromPosition;

      await onDetectionEml(buf.slice(0, fromPosition), outputDir);
      // Skip only the "\n" so the remaining data starts with "From ".
      buf = buf.slice(fromPosition + 1);
    }
  }

  // Whatever is left after the last boundary is the final message.
  if (buf.length > 0) {
    emlCount++;
    await onDetectionEml(buf, outputDir);
  }
  console.log(`END: Total ${toDisplay(emlCount)} files, Total ${toDisplay(readBytes)} bytes, Min: ${toDisplay(minEmlSize)} bytes, Max: ${toDisplay(maxEmlSize)} bytes`);
};

main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
{
"name": "convert-mbox-to-eml-files-by-nodejs",
"author": "mryhryki",
"private": true,
"license": "MIT",
"scripts": {
"start": "esbuild --platform=node --target=node16 --external:node:* --bundle --minify ./index.ts | node",
"dev": "nodemon --watch ./ --ext ts --exec 'npm start'",
"build": "esbuild --platform=node --target=node16 --external:node:* --bundle --minify --outfile=./index.js ./index.ts",
"lint": "mryhryki-lint",
"lint:fix": "mryhryki-lint-fix",
"type": "tsc",
"type:watch": "tsc --watch"
},
"dependencies": {
"@mryhryki/lint": "^0.0.8",
"@types/mailparser": "^3.4.0",
"@types/node": "^17.0.42",
"dayjs": "^1.11.3",
"esbuild": "^0.14.43",
"mailparser": "^3.5.0",
"typescript": "^4.7.3"
},
"homepage": "https://gist.github.com/13775c8b8cdc564589d66343473cdb83"
}
{
"compilerOptions": {
"allowJs": false,
"allowSyntheticDefaultImports": true,
"esModuleInterop": true,
"jsx": "preserve",
"lib": ["esnext", "DOM.Iterable", "dom"],
"noEmit": true,
"noImplicitAny": true,
"skipLibCheck": true,
"strict": true
},
"include": [
"*.ts"
],
"exclude": [
"**/node_modules/**",
"**/*.test.ts"
]
}
import crypto from "node:crypto"
import { AddressObject, ParsedMail } from "mailparser";
import dayjs from "dayjs";
/**
 * Computes the SHA-256 digest of a buffer.
 *
 * @param buf - Bytes to hash.
 * @returns The digest as a hexadecimal string.
 */
export const digestSha256 = (buf: Buffer): string => {
  const hash = crypto.createHash("sha256");
  hash.update(buf);
  return hash.digest("hex");
};
/**
 * Builds the extension-less relative output path for a mail:
 * `YYYY/MM/YYYY-MM-DD_HH-mm-ss_<sanitized subject>`.
 *
 * NOTE(review): when `mail.date` is undefined, dayjs presumably falls back to
 * the current time, which would make the result time-dependent — confirm.
 *
 * @param mail - Parsed mail whose `date` and `subject` are used.
 * @returns The relative path, without a file extension.
 */
export const getOutputFilePath = (mail: ParsedMail): string => {
  const subjectPart = normalizeToFilename(mail.subject);
  const datePart = dayjs(mail.date).format("YYYY/MM/YYYY-MM-DD_HH-mm-ss");
  return [datePart, subjectPart].join("_");
};
/**
 * Turns a mail subject into a string that is safe to use as a file name.
 *
 * Steps: fall back to "NO_SUBJECT" for null/undefined, apply Unicode
 * normalization, delete ":", replace each of `< > " | / ? * \` with "__",
 * replace every remaining UTF-16 code unit outside the allowed ranges with
 * "_", then keep at most the first 60 code units.
 *
 * @param filename - Subject text, possibly missing.
 * @returns The sanitized name (at most 60 UTF-16 code units).
 */
const normalizeToFilename = (filename: string | null | undefined): string => {
  // Inclusive [low, high] ranges of UTF-16 code units that are kept as-is.
  const allowedRanges: ReadonlyArray<readonly [number, number]> = [
    [0x21, 0x007E], // printable ASCII (space excluded)
    [0x3000, 0x3002], // ideographic space, ideographic comma and full stop
    [0x3008, 0x301B], // CJK brackets
    [0x3040, 0x309F], // hiragana
    [0x30A0, 0x30FF], // katakana
    [0x4E00, 0x9FFF], // kanji
    [0xFF01, 0xFF9F], // full-width alphanumerics, half-width katakana, etc.
  ];

  const withoutColons = (filename ?? "NO_SUBJECT").normalize().replace(/:/g, "");
  const escaped = withoutColons.replace(new RegExp("[<>\"|/?*\\\\]", "g"), "__");

  // Walk code units (not code points) so each half of a surrogate pair is
  // replaced separately.
  let sanitized = "";
  for (let index = 0; index < escaped.length; index++) {
    const code = escaped.charCodeAt(index);
    const isAllowed = allowedRanges.some(([low, high]) => low <= code && code <= high);
    sanitized += isAllowed ? escaped.charAt(index) : "_";
  }
  return sanitized.substring(0, 60);
};
/**
 * Formats one or more mailparser address objects as a single display string.
 *
 * Each address object's `value` is an array of address records (objects),
 * so joining it as the previous version did produced "[object Object]".
 * The object's pre-formatted `text` field is used instead.
 *
 * @param address - A single address object, a list of them, or undefined.
 * @returns The formatted texts joined with ", ", or `null` when no address
 *   was given.
 */
export const getAddressText = (address: AddressObject | AddressObject[] | undefined): string | null => {
  if (address == null) {
    return null;
  }
  const addressObjects = Array.isArray(address) ? address : [address];
  return addressObjects.map(({ text }) => text).join(", ");
};
/** Shared formatter created with the runtime's default locale and options. */
const numberWithComma = new Intl.NumberFormat();

/**
 * Formats a number for log output and left-pads it with spaces.
 *
 * @param num - Number to format.
 * @param pad - Minimum width of the result; defaults to 1.
 * @returns The formatted number, padded on the left to at least `pad` characters.
 */
export const toDisplay = (num: number, pad = 1): string => {
  const formatted = numberWithComma.format(num);
  return formatted.padStart(pad, " ");
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment