This Gist contains code we used at Causal to import our old Markdown documentation content into Sanity. See more on our blog!
Created
April 8, 2024 22:09
-
-
Save a-churchill/a562e1857364649f88aff03102982fba to your computer and use it in GitHub Desktop.
Importing Markdown to Sanity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This script converts Markdown files (e.g. as generated by GitBook) into a format that we
// can import into Sanity.
import { htmlToBlocks, randomKey } from "@sanity/block-tools"; | |
import { createClient } from "@sanity/client"; | |
import { readFile, writeFile } from "fs/promises"; | |
import { JSDOM } from "jsdom"; | |
import { toString } from "mdast-util-to-string"; | |
import { basename, extname } from "path"; | |
import { resolve } from "path"; | |
import rehypeRaw from "rehype-raw"; | |
import rehypeStringify from "rehype-stringify"; | |
import remarkExtractFrontmatter from "remark-extract-frontmatter"; | |
import remarkFrontmatter from "remark-frontmatter"; | |
import remarkGfm from "remark-gfm"; | |
import remarkParse from "remark-parse"; | |
import remarkRehype from "remark-rehype"; | |
import { unified } from "unified"; | |
import { EXIT, visitParents } from "unist-util-visit-parents"; | |
import { parse } from "yaml"; | |
import { schema } from "./schema.mjs"; | |
// Sanity client used to look up assets that already exist in the dataset.
// The API token is read from the SANITY_TOKEN environment variable.
const client = createClient({
  apiVersion: "2021-08-31",
  projectId: "xxxxxxxx",
  dataset: "production",
  token: process.env.SANITY_TOKEN,
});

/**
 * Promise for a Map from an image's original filename to its asset ID in Sanity.
 * The query is started once, at module load.
 */
const imagesByOriginalFilename = client
  .fetch('*[_type == "sanity.imageAsset"]{_id, originalFilename}')
  .then(assets => {
    const idsByFilename = new Map();
    for (const { _id, originalFilename } of assets) {
      idsByFilename.set(originalFilename, _id);
    }
    return idsByFilename;
  });

// The compiled schema type of the "content" field on "docPage", i.e. the block array type
// that is handed to htmlToBlocks.
const blockContentType = schema
  .get("docPage")
  .fields.find(({ name }) => name === "content").type;
/**
 * Upper-cases the first UTF-16 code unit of a string and leaves the rest untouched.
 *
 * @param {string} string - Text to capitalize.
 * @returns {string} The capitalized text; the empty string is returned as-is.
 */
function capitalize(string) {
  if (string === "") {
    return string;
  }
  const head = string.charAt(0);
  const tail = string.slice(1);
  return `${head.toUpperCase()}${tail}`;
}
/**
 * Runs a Markdown string through the unified pipeline and renders it to HTML.
 *
 * @param {string} markdown - Markdown source, optionally starting with YAML frontmatter.
 * @returns {Promise<{ html: *, data: object }>} The rendered HTML (`value` of the processed
 *   file) and the file's `data` object, which carries `title` plus whatever the frontmatter
 *   plugins put there.
 */
async function convertMarkdownToHtml(markdown) {
  /**
   * Plugin: stores the text of the first depth-1 heading on `file.data.title` (or "" when
   * there is none) and removes that heading from the tree.
   */
  function extractTitle() {
    return (tree, file) => {
      file.data.title = "";
      visitParents(tree, "heading", (heading, ancestors) => {
        if (heading.depth !== 1) {
          return undefined;
        }
        file.data.title = toString(heading);
        const directParent = ancestors[ancestors.length - 1];
        directParent.children = directParent.children.filter(child => child !== heading);
        // Only the first top-level heading is treated as the title.
        return EXIT;
      });
    };
  }

  const processor = unified()
    .use(extractTitle)
    .use(remarkParse)
    .use(remarkGfm) // we need this to support tables
    .use(remarkFrontmatter, { type: "yaml", marker: "-" })
    .use(remarkExtractFrontmatter, { yaml: parse })
    .use(remarkRehype, { allowDangerousHtml: true })
    .use(rehypeRaw)
    .use(rehypeStringify);

  const { value, data } = await processor.process(markdown);
  return { html: value, data };
}
/**
 * Reads the Markdown file at `filePath`, rewrites the GitBook `{% ... %}` tags so they survive
 * Markdown parsing, and returns an object `{ html, data }`.
 *
 * @param {string} filePath - Path of the Markdown file to read (as UTF-8).
 * @returns {Promise<{ html: *, data: object }>} Result of `convertMarkdownToHtml`.
 */
async function convertMarkdownFileToHtml(filePath) {
  console.error("Parsing file: ", filePath);

  // Applied in order; each pair is [text to find, replacement].
  const tagRewrites = [
    // Hints with markdown inside them won't get parsed correctly unless we first wrap them
    // in an <aside> element.
    ["{% hint style=", "<aside>{% hint style="],
    // Without this newline, markdown right after the hint won't be parsed.
    ['" %}', '" %}\n'],
    ["{% endhint %}", "{% endhint %}</aside>"],
    // Remove the endembed tag, which we ignore anyway.
    ["{% endembed %}", ""],
  ];

  let markdown = await readFile(filePath, "utf-8");
  for (const [needle, replacement] of tagRewrites) {
    markdown = markdown.replaceAll(needle, replacement);
  }
  return convertMarkdownToHtml(markdown);
}
/**
 * Splits a Markdown file path into the section, optional parent page and slug it describes.
 *
 * Only the part of the path after "causal-docs/" is considered. Accepted shapes are
 * `<section>/<page>.md`, `<section>/<page>/README.md` and `<section>/<parent>/<page>.md`.
 *
 * @param {string} filePath - Path of the Markdown file; must contain "causal-docs/".
 * @returns {{ section: string, parentPage: (string|undefined), slug: string }}
 * @throws {Error} When the path is outside "causal-docs/", has no section, is a section-level
 *   README, or is nested too deeply.
 */
function parseDocPath(filePath) {
  const path = filePath.split("causal-docs/")[1];
  if (path == null) {
    // Fail with a descriptive message instead of a TypeError on `undefined.split`.
    console.error("Path is not inside causal-docs/: ", filePath);
    throw new Error("Path is not inside causal-docs/");
  }

  const pathComponents = path.split("/");
  if (pathComponents.length < 2) {
    console.error("Path doesn't have section: ", path);
    throw new Error("Path doesn't have section");
  }
  if (pathComponents.length === 2 && pathComponents[1] === "README.md") {
    console.error("Section cannot have README");
    throw new Error("Section cannot have README");
  }
  if (pathComponents.length > 3) {
    console.error("Path is too deep: ", path);
    throw new Error("Path is too deep");
  }

  // For `<section>/<page>/README.md` the directory name is the page slug, so the two-element
  // destructuring below already yields the right values.
  let [section, slug] = pathComponents;
  let parentPage;
  if (pathComponents.length === 3 && pathComponents[2] !== "README.md") {
    [section, parentPage, slug] = pathComponents;
  }
  return { section, parentPage, slug: basename(slug, extname(slug)) };
}

/**
 * Converts the HTML rendered from one Markdown file into a Sanity document.
 *
 * @param {object} params
 * @param {string} params.filePath - Path of the source Markdown file (see `parseDocPath`).
 * @param {string} params.html - HTML rendered from the Markdown file.
 * @param {object} params.data - File data; `title` and `description` are copied to the document.
 * @param {Set<string>} params.sectionSlugs - Collects the section of every converted file
 *   (mutated: this file's section is added).
 * @param {string} params.imagesPath - Directory against which image filenames are resolved
 *   when the image is not already known to Sanity.
 * @param {string[]} params.files - All file paths being imported; used to decide whether a
 *   relative link points at another imported page.
 * @returns {Promise<object>} A "resource" document when the section is "resources", otherwise
 *   a "docPage" document with `parentSection` / `parentPage` references.
 * @throws {Error} When the path is invalid or a hint block cannot be parsed.
 */
async function convertHtmlToSanity({ filePath, html, data, sectionSlugs, imagesPath, files }) {
  const images = await imagesByOriginalFilename;
  const { title, description } = data;

  const { section, parentPage, slug } = parseDocPath(filePath);
  sectionSlugs.add(section);
  console.error("Converting to Sanity document: ", JSON.stringify({ section, parentPage, slug }));

  /** Lower-cased tag name of a node, or undefined for nodes without one (e.g. text nodes). */
  const tagNameOf = el => el.tagName?.toLowerCase();
  /** Parses an HTML string into a DOM document for htmlToBlocks. */
  const parseHtml = markup => new JSDOM(markup).window.document;
  // Slugs of every file in this import, built once so link resolution is a constant-time lookup.
  const knownSlugs = new Set(files.map(file => basename(file, extname(file))));

  const htmlToBlocksRules = [
    {
      // Code blocks
      deserialize: (el, next, block) => {
        if (tagNameOf(el) !== "pre") return undefined;
        const codeNode = el.children[0];
        // Prefer the contents of a leading <code> child; otherwise use the <pre> itself.
        const childNodes =
          codeNode && codeNode.tagName.toLowerCase() === "code"
            ? codeNode.childNodes
            : el.childNodes;
        const code = Array.from(childNodes, node => node.textContent).join("");
        return block({ _type: "code", code });
      },
    },
    {
      // Loom/YouTube embed blocks
      deserialize: (el, next, block) => {
        if (tagNameOf(el) !== "p") return undefined;
        if (!el.textContent?.startsWith('{% embed url="')) return undefined;
        // Take only the quoted value so that any text following the tag in the same paragraph
        // (e.g. a caption) does not end up inside the URL.
        const [, url] = el.textContent.match(/^\{% embed url="([^"]*)/);
        if (url.startsWith("https://www.youtube.com")) {
          return block({ _type: "youtube", url });
        }
        if (url.startsWith("https://www.loom.com")) {
          return block({ _type: "loom", url });
        }
        console.warn(`Unknown embed URL: ${url}`);
        return undefined;
      },
    },
    {
      // Images and figures
      deserialize: (el, next, block) => {
        const tagName = tagNameOf(el);
        if (tagName !== "img" && tagName !== "figure") return undefined;

        const img = tagName === "img" ? el : el.querySelector("img");
        const src = img?.getAttribute("src");
        if (src == null) {
          // A <figure> without an <img>, or an <img> without src: nothing to import.
          console.warn(`Image without src: ${el.outerHTML}`);
          return undefined;
        }
        const caption =
          tagName === "img" ? "" : el.querySelector("figcaption")?.textContent ?? "";

        const filename = basename(decodeURIComponent(src));
        if (src.startsWith("http")) {
          // download the image to the images path
          console.warn(`❗ Download image before import: ${src}`);
        }
        const imageId = images.get(filename);
        if (imageId == null) {
          // Not uploaded yet: point the import at the local file instead of an asset reference.
          const localPath = resolve(imagesPath, filename);
          return block({ _type: "image", caption, _sanityAsset: `image@file://${localPath}` });
        }
        return block({ _type: "image", caption, asset: { _type: "reference", _ref: imageId } });
      },
    },
    {
      // Hint blocks
      deserialize: (el, next, block) => {
        if (tagNameOf(el) !== "aside") return undefined;
        if (!el.textContent?.startsWith('{% hint style="')) return undefined;
        const regex = /\{% hint style="(\w+)" %} (.*) {% endhint %\}(.*)/;
        try {
          // Destructuring throws when the regex does not match; that is reported below.
          const [, style, ...text] = el.innerHTML.match(regex);
          const content = htmlToBlocks(`<div>${text.join("")}</div>`, blockContentType, {
            parseHtml,
            rules: htmlToBlocksRules,
          });
          return block({ _type: "callout", style, content });
        } catch (e) {
          console.warn(`Failed to parse hint: ${el.innerHTML}`);
          throw e;
        }
      },
    },
    {
      // Summary/Detail blocks
      deserialize: (el, next, block) => {
        if (tagNameOf(el) !== "details") return undefined;
        const summaryElement = el.querySelector("summary");
        if (summaryElement == null) {
          console.warn("Details element without summary");
          return undefined;
        }
        const summaryTitle = summaryElement.textContent;
        // Drop the summary so that only the body is converted into the collapsible's content.
        el.removeChild(summaryElement);
        const content = htmlToBlocks(el.innerHTML, blockContentType, {
          rules: htmlToBlocksRules,
          parseHtml,
        });
        return block({ _type: "collapsible", title: summaryTitle, content });
      },
    },
    {
      // "{% content-ref" blocks
      deserialize: el => {
        if (tagNameOf(el) !== "p") return undefined;
        if (!el.textContent?.startsWith("{% content-ref")) return undefined;
        // Converting an empty string replaces the paragraph with whatever blocks an empty
        // document produces.
        return htmlToBlocks("", blockContentType, { parseHtml });
      },
    },
    {
      // Links to other documentation pages
      deserialize: (el, next) => {
        if (tagNameOf(el) !== "a") return undefined;
        let href = el.getAttribute("href");
        if (href == null) return undefined;
        if (href.startsWith("http")) return undefined; // default handling is fine
        // GitBook uses broken-reference to indicate a link to a non-existent page
        if (href === "broken-reference") return next(el.childNodes);
        href = href.split("#")[0];
        if (href.endsWith("/")) {
          href = href.slice(0, -1);
        }
        const targetSlug = basename(href, extname(href));
        if (!knownSlugs.has(targetSlug)) return undefined;
        // had to look at the source code to figure this one out -___-
        return {
          _type: "__annotation",
          markDef: {
            _key: randomKey(12),
            _type: "internalLink",
            reference: { _type: "reference", _ref: targetSlug },
          },
          children: next(el.childNodes),
        };
      },
    },
    {
      // Tables
      deserialize: (el, next, block) => {
        if (tagNameOf(el) !== "table") return undefined;
        const columns = [...el.querySelectorAll("thead th")].map(th => th.textContent);
        const bodyRows = [...el.querySelectorAll("tbody tr")].map(row =>
          // Cells beyond the number of header columns are dropped.
          [...row.querySelectorAll("td")].slice(0, columns.length).map(cell => cell.textContent),
        );
        return block({
          _type: "table",
          rows: [columns, ...bodyRows].map(cells => ({ _type: "tableRow", cells })),
        });
      },
    },
    // Manual:
    // - Tabs
  ];

  // NOTE(review): htmlToBlocks is documented as taking (html, blockContentType, options); the
  // fourth argument below looks like it is ignored — confirm and remove if so.
  const portableText = htmlToBlocks(
    html,
    blockContentType,
    { rules: htmlToBlocksRules, parseHtml },
    { allowedDecorators: true },
  );

  const isResourcePage = section === "resources";
  return {
    _type: isResourcePage ? "resource" : "docPage",
    _id: slug,
    title,
    description,
    slug: { _type: "slug", current: slug },
    content: portableText,
    ...(isResourcePage
      ? {}
      : {
          parentSection: { _type: "reference", _ref: section },
          parentPage: parentPage != null ? { _type: "reference", _ref: parentPage } : undefined,
        }),
  };
}
/**
 * Command-line entry point.
 *
 * Usage: importMarkdown.mjs <path-to-images> <path> [...<path>]
 *
 * For every Markdown path given, writes `<path>.html` (the intermediate HTML) and
 * `<path>.json` (the converted document) next to the source file, then writes
 * `sanity-import.ndjson` in the current working directory containing one line per section
 * followed by one line per converted document. Progress is logged to stderr.
 */
async function main() {
  if (process.argv.length < 4) {
    console.error("Usage: importMarkdown.mjs <path-to-images> <path> [...<path>]");
    process.exit(1);
  }
  const imagesPath = process.argv[2];
  const files = process.argv.slice(3);

  // Filled in by convertHtmlToSanity with the section of every converted file.
  const sectionSlugs = new Set();
  const docPages = await Promise.all(
    files.map(async filePath => {
      const { html, data } = await convertMarkdownFileToHtml(filePath);
      await writeFile(`${filePath}.html`, html);
      const doc = await convertHtmlToSanity({
        filePath,
        html,
        data,
        sectionSlugs,
        imagesPath,
        files,
      });
      await writeFile(`${filePath}.json`, JSON.stringify(doc, null, 2));
      return doc;
    }),
  );

  const sections = [...sectionSlugs].map(slug => ({
    _type: "docSection",
    _id: slug,
    // "getting-started-and-setup" -> "Getting Started and Setup"
    title: slug
      .split("-")
      .map(word => (word === "and" ? word : capitalize(word)))
      .join(" "),
    slug: { _type: "slug", current: slug },
  }));
  console.error(`Converted ${docPages.length} documents`);
  console.error(`Found ${sections.length} sections`);

  // NDJSON: one JSON document per line, sections first, then the pages.
  const ndjson = [...sections, ...docPages].map(doc => `${JSON.stringify(doc)}\n`).join("");
  await writeFile("sanity-import.ndjson", ndjson);
  console.error("Wrote sanity-import.ndjson");
}

// Don't leave the promise floating: report any failure and signal it through the exit code.
main().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment