@daniel31x13
Last active May 29, 2025 00:40

archivebox2linkwarden.js

Description

This script automates the import of ArchiveBox snapshots into a Linkwarden instance by directly executing SQL against the Postgres database and arranging the archived files into Linkwarden's expected directory structure. Make sure to back up your Linkwarden database before running the script, just in case.
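
For example, one quick way to take a backup with pg_dump (a sketch assuming the same Postgres defaults the script itself uses, i.e. user postgres and database postgres; adjust to match your setup):

Docker mode: docker exec <containerId> pg_dump -U postgres -d postgres > linkwarden-backup.sql

Host mode: pg_dump "$DATABASE_URL" > linkwarden-backup.sql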

It handles two connection modes:

  • Docker mode: Executes psql inside a running Postgres container via docker exec, specified by the --postgres-container-id flag.

  • Host mode: Uses the DATABASE_URL environment variable to connect via the local psql client.

Note

To decide between Docker mode and Host mode, check how you installed Linkwarden: if you used the Docker installation route, use Docker mode; if you installed manually, use Host mode.

How It Works

For each ArchiveBox subdirectory under the given path, it:

  1. Reads index.json to extract title, URL, and bookmark timestamp.

  2. Inserts a "Link" row in Postgres, capturing the new link ID.

  3. Copies singlefile.html, output.pdf, and screenshot.png (if present) into linkwarden-archives/{collectionId}/{linkId}.[html|pdf|png].

  4. Merges readability output (article.json, content.html, content.txt) into linkwarden-archives/{collectionId}/{linkId}_readability.json.

  5. Updates the newly created Link record's image, pdf, monolith, and readable columns to point at those files.
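
To illustrate steps 3–5, here is roughly what one imported snapshot ends up looking like on disk (the collection ID 12 and link ID 345 are made-up values; yours will differ):

ls linkwarden-archives/12/
# 345.html                copied from singlefile.html
# 345.pdf                 copied from output.pdf
# 345.png                 copied from screenshot.png
# 345_readability.json    merged from readability/article.json, content.html, and content.txt

The new Link row's image, pdf, monolith, and readable columns then store the corresponding archives/12/345.* paths.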

Requirements

  • Node.js

  • psql client (Host mode) or the Docker CLI (Docker mode)

Usage Example

Docker mode

./archivebox2linkwarden.js --path ./data/archive --user-id 1 --postgres-container-id 123456789abc

Host mode

export DATABASE_URL=postgresql://user:pass@localhost:5432/postgres && ./archivebox2linkwarden.js --path ./data/archive --user-id 17

Required Flags

  • --path <archiveDir>: path to your ArchiveBox "archive" directory

  • --user-id <number>: Linkwarden user ID that will own the imported links; usually 1 if you are the admin

  • --postgres-container-id <string>: Docker container ID of the linkwarden-postgres container; it can be found with the docker ps command (optional if the DATABASE_URL environment variable is set)
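
For example, if your Postgres container follows the usual linkwarden-postgres naming (an assumption; adjust the filter if yours is named differently), you can list its ID with:

docker ps --filter "name=postgres" --format "{{.ID}}  {{.Names}}"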

Output

  • Inserts a new Collection named "Imports" and its Links into the Linkwarden database

  • Creates a linkwarden-archives/{collectionId}/ directory containing the archives

  • Logs progress and final instructions
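
The final instruction printed by the script is to move the imported files into your instance's data directory. A minimal sketch of that step, using a hypothetical /path/to/linkwarden/data location (replace it with wherever your Linkwarden instance keeps its data/archives directory; for Docker installs this is typically the data directory mounted into the linkwarden container):

mv linkwarden-archives/<collectionId> /path/to/linkwarden/data/archives/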

#!/usr/bin/env node
const fs = require("fs");
const path = require("path");
const { execSync } = require("child_process");

const raw = process.argv.slice(2);

function getFlag(names) {
  for (let n of names) {
    const idx = raw.indexOf(n);
    if (idx !== -1 && idx + 1 < raw.length) return raw[idx + 1];
  }
}

const ARCHIVE_ROOT = getFlag(["--path"]);
const USER_ID = parseInt(getFlag(["--user-id"]), 10);
const POSTGRES_CONTAINER = getFlag(["--postgres-container-id"]);

if (!ARCHIVE_ROOT || isNaN(USER_ID)) {
  console.error(
    "Usage: ./archivebox2linkwarden.js --path <archiveDir> --user-id <yourUserId> --postgres-container-id <containerId>"
  );
  process.exit(1);
}

const DATABASE_URL = process.env.DATABASE_URL;
if (!POSTGRES_CONTAINER && !DATABASE_URL) {
  console.error(
    "Error: no DATABASE_URL or --postgres-container-id flag specified.\n" +
      "Please set DATABASE_URL=postgres://… (or set the --postgres-container-id flag if you are using Docker)"
  );
  process.exit(1);
}

// escape single quotes for SQL literals
function esc(str) {
  return str.replace(/'/g, "''");
}

// normalize "YYYY-MM-DD HH:mm" → ISO8601
function toIso(s) {
  return new Date(s.replace(" ", "T") + ":00Z").toISOString();
}

/**
 * Run SQL via either host psql or docker exec.
 * returnOutput=true captures and returns the output (for RETURNING id).
 */
function runPSQL(sql, returnOutput = false) {
  const flags = returnOutput ? "-q -t -A --no-psqlrc" : "-q --no-psqlrc";
  const escaped = sql.replace(/"/g, '\\"');
  let cmd, opts;
  if (POSTGRES_CONTAINER) {
    // exec into given container
    cmd = `docker exec -i ${POSTGRES_CONTAINER} psql -U postgres -d postgres ${flags} -c "${escaped}"`;
    opts = returnOutput ? { stdio: "pipe" } : { stdio: "inherit" };
  } else {
    cmd = `psql "${DATABASE_URL}" ${flags} -c "${escaped}"`;
    opts = returnOutput ? { stdio: "pipe" } : { stdio: "inherit" };
  }
  const out = execSync(cmd, opts);
  return returnOutput ? out.toString().trim() : null;
}

function main() {
  // 1) discover all timestamped subdirs, sorted numerically
  const subdirs = fs
    .readdirSync(ARCHIVE_ROOT)
    .filter((d) => fs.statSync(path.join(ARCHIVE_ROOT, d)).isDirectory())
    .sort((a, b) => parseFloat(a) - parseFloat(b));

  // 2) create the “Imports” collection
  const collName = esc("Imports").slice(0, 254);
  const collDesc = esc("").slice(0, 254);
  const collColor = esc("#0ea5e9").slice(0, 50);
  const collSql = `
    INSERT INTO "Collection"
      (name, description, color, "ownerId", "createdById", "createdAt", "updatedAt", "isPublic")
    VALUES
      ('${collName}','${collDesc}','${collColor}',${USER_ID},${USER_ID},NOW(),NOW(),false)
    RETURNING id;
  `;
  const collectionId = runPSQL(collSql, true);
  console.log(`✔ Created collection ${collectionId}`);

  // prepare destination folder
  const DEST_ROOT = "linkwarden-archives";
  const collDir = path.join(DEST_ROOT, collectionId);
  fs.mkdirSync(collDir, { recursive: true });

  // 3) insert each link, copy files, then UPDATE file columns
  subdirs.forEach((subdir, idx) => {
    const srcDir = path.join(ARCHIVE_ROOT, subdir);
    const idxPath = path.join(srcDir, "index.json");
    if (!fs.existsSync(idxPath)) {
      console.warn(`⚠️ skipping ${subdir} (no index.json)`);
      return;
    }
    const data = JSON.parse(fs.readFileSync(idxPath, "utf8"));
    const name = esc(data.title || data.domain || data.url || "").slice(0, 254);
    const url = esc(data.url || "").slice(0, 2047);
    const importDate = data.bookmarked_date
      ? toIso(data.bookmarked_date)
      : new Date().toISOString();

    // insert the link and capture its id
    const linkSql = `
      INSERT INTO "Link"
        (url, name, description, "importDate", "collectionId", "createdById", "createdAt", "updatedAt")
      VALUES
        ('${url}','${name}','', '${importDate}', ${collectionId}, ${USER_ID}, NOW(), NOW())
      RETURNING id;
    `;
    const linkId = runPSQL(linkSql, true);

    // base path for all artifacts
    const base = path.join(collDir, linkId);

    // copy singlefile.html → {linkId}.html
    const sf = path.join(srcDir, "singlefile.html");
    if (fs.existsSync(sf)) {
      fs.copyFileSync(sf, base + ".html");
    }

    // copy output.pdf → {linkId}.pdf
    const pd = path.join(srcDir, "output.pdf");
    if (fs.existsSync(pd)) {
      fs.copyFileSync(pd, base + ".pdf");
    }

    // copy screenshot.png → {linkId}.png
    const sn = path.join(srcDir, "screenshot.png");
    if (fs.existsSync(sn)) {
      fs.copyFileSync(sn, base + ".png");
    }

    // merge readability → {linkId}_readability.json
    const rdDir = path.join(srcDir, "readability");
    if (fs.existsSync(rdDir)) {
      let meta = {};
      const mF = path.join(rdDir, "article.json");
      if (fs.existsSync(mF)) meta = JSON.parse(fs.readFileSync(mF, "utf8"));
      const hF = path.join(rdDir, "content.html");
      const tF = path.join(rdDir, "content.txt");
      meta.content = fs.existsSync(hF) ? fs.readFileSync(hF, "utf8") : "";
      meta.textContent = fs.existsSync(tF) ? fs.readFileSync(tF, "utf8") : "";
      fs.writeFileSync(
        base + "_readability.json",
        JSON.stringify(meta, null, 2)
      );
    }

    // build the UPDATE clause for any files we copied
    const updates = [];
    if (fs.existsSync(base + ".png")) {
      updates.push(
        `image='${path.posix.join(`archives/${collectionId}/${linkId}.png`)}'`
      );
    }
    if (fs.existsSync(base + ".pdf")) {
      updates.push(
        `pdf='${path.posix.join(`archives/${collectionId}/${linkId}.pdf`)}'`
      );
    }
    if (fs.existsSync(base + ".html")) {
      updates.push(
        `monolith='${path.posix.join(
          `archives/${collectionId}/${linkId}.html`
        )}'`
      );
    }
    if (fs.existsSync(base + "_readability.json")) {
      updates.push(
        `readable='${path.posix.join(
          `archives/${collectionId}/${linkId}_readability.json`
        )}'`
      );
    }
    if (updates.length) {
      const updSql = `
        UPDATE "Link"
        SET ${updates.join(", ")}
        WHERE id = ${linkId};
      `;
      runPSQL(updSql, false);
    }
  });

  console.log("🎉 Import complete!");
  console.log("Created a 'linkwarden-archives' directory.");
  console.warn(
    `⚠️ One final (but important) step: move the 'linkwarden-archives/${collectionId}' directory into your Linkwarden instance’s data directory so it becomes 'data/archives/${collectionId}'.`
  );
}
main();