Skip to content

Instantly share code, notes, and snippets.

@skylord123
Last active February 28, 2024 03:28
Show Gist options
  • Save skylord123/0ec487dc3259f73e11da597eb6d1d495 to your computer and use it in GitHub Desktop.
Save skylord123/0ec487dc3259f73e11da597eb6d1d495 to your computer and use it in GitHub Desktop.
Node-RED node to convert markdown to HTML for Matrix

Node-RED mark2html for matrix

This is taken from matrix-react-sdk as of Feb 27, 2024. The idea is to render markdown to HTML as closely as possible to the way Element does. The code was converted from TypeScript to JavaScript so it could be easily used inside a Node-RED function node.

Simply import the contents of flow.json using the hamburger menu in Node-RED.

The function node takes the markdown in msg.payload, converts it, and sends the resulting HTML as the new msg.payload on the first output. If there is an error, the message is routed to the second output instead, with msg.payload set to the error.

image

Result for the default example markdown
# Hello, CommonMark!

This is an example of converting Markdown to HTML using CommonMark in JavaScript.

## Features

- **Bold**
- *Italics*
- `Code snippets`

> Blockquote

1. Ordered list
2. Second item

[Link](http://example.com)

![Image](http://example.com/image.png)

image

[{"id":"679f4cef853e9f7a","type":"function","z":"c18eb91129538d4b","name":"Matrix markdown2html","func":"// author: Skylord123 https://github.com/skylord123\n// source: https://gist.github.com/skylord123/0ec487dc3259f73e11da597eb6d1d495/\n\n// some defaults\nconst ALLOWED_HTML_TAGS = [\"sub\", \"sup\", \"del\", \"u\", \"br\", \"br/\"];\n\n// These types of node are definitely text\nconst TEXT_NODES = [\"text\", \"softbreak\", \"linebreak\", \"paragraph\", \"document\"];\n\nlet escaper = lodash.escape;\n\nfunction isAllowedHtmlTag(node) {\n if (!node.literal) {\n return false\n }\n\n if (\n node.literal.match('^<((div|span) data-mx-maths=\"[^\"]*\"|/(div|span))>$') !=\n null\n ) {\n return true\n }\n\n // Regex won't work for tags with attrs, but the tags we allow\n // shouldn't really have any anyway.\n const matches = /^<\\/?(.*)>$/.exec(node.literal)\n if (matches && matches.length == 2) {\n const tag = matches[1]\n return ALLOWED_HTML_TAGS.indexOf(tag) > -1\n }\n\n return false\n}\n\n/*\n * Returns true if the parse output containing the node\n * comprises multiple block level elements (ie. 
lines),\n * or false if it is only a single line.\n */\nfunction isMultiLine(node) {\n let par = node\n while (par.parent) {\n par = par.parent\n }\n return par.firstChild != par.lastChild\n}\n\nfunction getTextUntilEndOrLinebreak(node) {\n let currentNode = node\n let text = \"\"\n while (\n currentNode &&\n currentNode.type !== \"softbreak\" &&\n currentNode.type !== \"linebreak\"\n ) {\n const { literal, type } = currentNode\n if (type === \"text\" && literal) {\n let n = 0\n let char = literal[n]\n while (char !== \" \" && char !== null && n <= literal.length) {\n if (char === \" \") {\n break\n }\n if (char) {\n text += char\n }\n n += 1\n char = literal[n]\n }\n if (char === \" \") {\n break\n }\n }\n currentNode = currentNode.next\n }\n return text\n}\n\nconst formattingChangesByNodeType = {\n emph: \"_\",\n strong: \"__\"\n}\n\n/**\n * Returns the literal of a node an all child nodes.\n */\nconst innerNodeLiteral = node => {\n let literal = \"\"\n\n const walker = node.walker()\n let step\n\n while ((step = walker.next())) {\n const currentNode = step.node\n const currentNodeLiteral = currentNode.literal\n if (step.entering && currentNode.type === \"text\" && currentNodeLiteral) {\n literal += currentNodeLiteral\n }\n }\n\n return literal\n}\n\nconst emptyItemWithNoSiblings = node => {\n return !node.prev && !node.next && !node.firstChild\n}\n\n/**\n * Class that wraps commonmark, adding the ability to see whether\n * a given message actually uses any markdown syntax or whether\n * it's plain text.\n */\nconst Markdown = function(input) {\n this.constructor = function(input) {\n this.input = input\n\n const parser = new commonmark.Parser()\n this.parsed = parser.parse(this.input)\n this.parsed = this.repairLinks(this.parsed)\n };\n\n /**\n * This method is modifying the parsed AST in such a way that links are always\n * properly linkified instead of sometimes being wrongly emphasised in case\n * if you were to write a link like the example below:\n * 
https://my_weird-link_domain.domain.com\n * ^ this link would be parsed to something like this:\n * <a href=\"https://my\">https://my</a><b>weird-link</b><a href=\"https://domain.domain.com\">domain.domain.com</a>\n * This method makes it so the link gets properly modified to a version where it is\n * not emphasised until it actually ends.\n * See: https://github.com/vector-im/element-web/issues/4674\n * @param parsed\n */\n this.repairLinks = function(parsed) {\n const walker = parsed.walker()\n let event = null\n let text = \"\"\n let isInPara = false\n let previousNode = null\n let shouldUnlinkFormattingNode = false\n while ((event = walker.next())) {\n const { node } = event\n if (node.type === \"paragraph\") {\n if (event.entering) {\n isInPara = true\n } else {\n isInPara = false\n }\n }\n if (isInPara) {\n // Clear saved string when line ends\n if (\n node.type === \"softbreak\" ||\n node.type === \"linebreak\" ||\n // Also start calculating the text from the beginning on any spaces\n (node.type === \"text\" && node.literal === \" \")\n ) {\n text = \"\"\n continue\n }\n\n // Break up text nodes on spaces, so that we don't shoot past them without resetting\n if (node.type === \"text\" && node.literal) {\n const [thisPart, ...nextParts] = node.literal.split(/( )/)\n node.literal = thisPart\n text += thisPart\n\n // Add the remaining parts as siblings\n nextParts.reverse().forEach(part => {\n if (part) {\n const nextNode = new commonmark.Node(\"text\")\n nextNode.literal = part\n node.insertAfter(nextNode)\n // Make the iterator aware of the newly inserted node\n walker.resumeAt(nextNode, true)\n }\n })\n }\n\n // We should not do this if previous node was not a textnode, as we can't combine it then.\n if (\n (node.type === \"emph\" || node.type === \"strong\") &&\n previousNode?.type === \"text\"\n ) {\n if (event.entering) {\n const foundLinks = linkify.find(text)\n for (const { value } of foundLinks) {\n if (node?.firstChild?.literal) {\n /**\n * NOTE: This 
technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings\n * but this solution seems to work well and is hopefully slightly easier to understand too\n */\n const format = formattingChangesByNodeType[node.type]\n const nonEmphasizedText = `${format}${innerNodeLiteral(\n node\n )}${format}`\n const f = getTextUntilEndOrLinebreak(node)\n const newText = value + nonEmphasizedText + f\n const newLinks = linkify.find(newText)\n // Should always find only one link here, if it finds more it means that the algorithm is broken\n if (newLinks.length === 1) {\n const emphasisTextNode = new commonmark.Node(\"text\")\n emphasisTextNode.literal = nonEmphasizedText\n previousNode.insertAfter(emphasisTextNode)\n node.firstChild.literal = \"\"\n event = node.walker().next()\n if (event) {\n // Remove `em` opening and closing nodes\n node.unlink()\n previousNode.insertAfter(event.node)\n shouldUnlinkFormattingNode = true\n }\n } else {\n node.error(\"Markdown links escaping found too many links for following text: \" + text, msg);\n node.error(\"Markdown links escaping found too many links for modified text: \" + newText, msg);\n }\n }\n }\n } else {\n if (shouldUnlinkFormattingNode) {\n node.unlink()\n shouldUnlinkFormattingNode = false\n }\n }\n }\n }\n previousNode = node\n }\n return parsed\n };\n\n this.isPlainText = function() {\n const walker = this.parsed.walker()\n let ev\n\n while ((ev = walker.next())) {\n const node = ev.node\n\n if (TEXT_NODES.indexOf(node.type) > -1) {\n // definitely text\n continue\n } else if (node.type == \"list\" || node.type == \"item\") {\n // Special handling for inputs like `+`, `*`, `-` and `2021.` which\n // would otherwise be treated as a list of a single empty item.\n // See https://github.com/vector-im/element-web/issues/7631\n if (\n node.type == \"list\" &&\n node.firstChild &&\n emptyItemWithNoSiblings(node.firstChild)\n ) {\n // A list with a single empty item is treated as plain 
text.\n continue\n }\n\n if (node.type == \"item\" && emptyItemWithNoSiblings(node)) {\n // An empty list item with no sibling items is treated as plain text.\n continue\n }\n\n // Everything else is actual lists and therefore not plaintext.\n return false\n } else if (node.type == \"html_inline\" || node.type == \"html_block\") {\n // if it's an allowed html tag, we need to render it and therefore\n // we will need to use HTML. If it's not allowed, it's not HTML since\n // we'll just be treating it as text.\n if (isAllowedHtmlTag(node)) {\n return false\n }\n } else {\n return false\n }\n }\n return true\n };\n\n this.toHTML = function({ externalLinks = false } = {}) {\n const renderer = new commonmark.HtmlRenderer({\n safe: false,\n\n // Set soft breaks to hard HTML breaks: commonmark\n // puts softbreaks in for multiple lines in a blockquote,\n // so if these are just newline characters then the\n // block quote ends up all on one line\n // (https://github.com/vector-im/element-web/issues/3154)\n softbreak: \"<br />\"\n })\n\n // Trying to strip out the wrapping <p/> causes a lot more complication\n // than it's worth, i think. For instance, this code will go and strip\n // out any <p/> tag (no matter where it is in the tree) which doesn't\n // contain \\n's.\n // On the flip side, <p/>s are quite opionated and restricted on where\n // you can nest them.\n //\n // Let's try sending with <p/>s anyway for now, though.\n const realParagraph = renderer.paragraph\n renderer.paragraph = function (node, entering) {\n // If there is only one top level node, just return the\n // bare text: it's a single line of text and so should be\n // 'inline', rather than unnecessarily wrapped in its own\n // p tag. 
If, however, we have multiple nodes, each gets\n // its own p tag to keep them as separate paragraphs.\n // However, if it's a blockquote, adds a p tag anyway\n // in order to avoid deviation to commonmark and unexpected\n // results when parsing the formatted HTML.\n if (node.parent?.type === \"block_quote\" || isMultiLine(node)) {\n realParagraph.call(this, node, entering)\n }\n }\n\n renderer.link = function (node, entering) {\n const attrs = this.attrs(node)\n if (entering && node.destination) {\n attrs.push([\"href\", this.esc(node.destination)])\n if (node.title) {\n attrs.push([\"title\", this.esc(node.title)])\n }\n // Modified link behaviour to treat them all as external and\n // thus opening in a new tab.\n if (externalLinks) {\n attrs.push([\"target\", \"_blank\"])\n attrs.push([\"rel\", \"noreferrer noopener\"])\n }\n this.tag(\"a\", attrs)\n } else {\n this.tag(\"/a\")\n }\n }\n\n renderer.html_inline = function (node) {\n if (node.literal) {\n if (isAllowedHtmlTag(node)) {\n this.lit(node.literal)\n } else {\n this.lit(escaper(node.literal))\n }\n }\n }\n\n renderer.html_block = function (node) {\n /*\n // as with `paragraph`, we only insert line breaks\n // if there are multiple lines in the markdown.\n const isMultiLine = is_multi_line(node);\n if (isMultiLine) this.cr();\n */\n renderer.html_inline(node)\n /*\n if (isMultiLine) this.cr();\n */\n }\n\n return renderer.render(this.parsed)\n };\n\n /*\n * Render the markdown message to plain text. That is, essentially\n * just remove any backslashes escaping what would otherwise be\n * markdown syntax\n * (to fix https://github.com/vector-im/element-web/issues/2870).\n *\n * N.B. this does **NOT** render arbitrary MD to plain text - only MD\n * which has no formatting. 
Otherwise it emits HTML(!).\n */\n this.toPlaintext = function() {\n const renderer = new commonmark.HtmlRenderer({ safe: false })\n\n renderer.paragraph = function (node, entering) {\n // as with toHTML, only append lines to paragraphs if there are\n // multiple paragraphs\n if (isMultiLine(node)) {\n if (!entering && node.next) {\n this.lit(\"\\n\\n\")\n }\n }\n }\n\n renderer.html_block = function (node) {\n if (node.literal) this.lit(node.literal)\n if (isMultiLine(node) && node.next) this.lit(\"\\n\\n\")\n }\n\n return renderer.render(this.parsed)\n };\n \n this.constructor(input);\n}\n\ntry {\n let md = new Markdown(msg.payload);\n msg.payload = md.toHTML();\n} catch(e) {\n msg.payload = e;\n return [null, msg];\n}\n\nreturn msg;","outputs":2,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[{"var":"commonmark","module":"commonmark"},{"var":"lodash","module":"lodash"},{"var":"linkify","module":"linkifyjs"}],"x":620,"y":1560,"wires":[["199c9687ef9e7754"],[]]},{"id":"ceb81dce7deea71a","type":"inject","z":"c18eb91129538d4b","name":"","props":[],"repeat":"","crontab":"","once":false,"onceDelay":0.1,"topic":"","x":270,"y":1560,"wires":[["6679d6024e9aad9d"]]},{"id":"199c9687ef9e7754","type":"debug","z":"c18eb91129538d4b","name":"debug 140","active":true,"tosidebar":true,"console":false,"tostatus":false,"complete":"true","targetType":"full","statusVal":"","statusType":"auto","x":810,"y":1560,"wires":[]},{"id":"6679d6024e9aad9d","type":"function","z":"c18eb91129538d4b","name":"Set markdown","func":"msg.payload = `\n# Hello, CommonMark!\n\nThis is an example of converting Markdown to HTML using CommonMark in JavaScript.\n\n## Features\n\n- **Bold**\n- *Italics*\n- \\`Code snippets\\`\n\n> Blockquote\n\n1. Ordered list\n2. Second item\n\n[Link](http://example.com)\n\n![Image](http://example.com/image.png)\n\n`;\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":420,"y":1560,"wires":[["679f4cef853e9f7a"]]}]
// author: Skylord123 https://github.com/skylord123
// source: https://gist.github.com/skylord123/0ec487dc3259f73e11da597eb6d1d495/
// Raw HTML tag names (as they appear between "<" / "</" and ">") that
// isAllowedHtmlTag() accepts.
const ALLOWED_HTML_TAGS = [
  "sub",
  "sup",
  "del",
  "u",
  "br",
  "br/",
];

// commonmark node types that are always treated as plain text by
// Markdown#isPlainText().
const TEXT_NODES = [
  "text",
  "softbreak",
  "linebreak",
  "paragraph",
  "document",
];

// HTML-escaping function used for raw HTML that is not on the allow list.
let escaper = lodash.escape;
/**
 * Tells whether a raw HTML node holds exactly one tag that is on the allow list.
 *
 * Accepted literals are:
 *  - an opening `div`/`span` carrying only a `data-mx-maths` attribute, or a
 *    closing `div`/`span`;
 *  - an opening or closing tag whose name is listed in ALLOWED_HTML_TAGS.
 *
 * @param node - object with a `literal` property holding the raw HTML text
 * @returns {boolean} true when the literal is an allowed tag, false otherwise
 */
function isAllowedHtmlTag(node) {
  const html = node.literal;
  if (!html) {
    return false;
  }

  const mathsTag = html.match('^<((div|span) data-mx-maths="[^"]*"|/(div|span))>$');
  if (mathsTag !== null) {
    return true;
  }

  // This pattern captures everything between the brackets, so a tag with
  // attributes never equals an allow-listed name and is rejected.
  const parts = /^<\/?(.*)>$/.exec(html);
  if (!parts || parts.length !== 2) {
    return false;
  }
  return ALLOWED_HTML_TAGS.includes(parts[1]);
}
/*
 * Climbs from the given node to the root of its tree and reports whether
 * that root has more than one child, i.e. whether the parse output is made
 * of several block level elements (lines) rather than a single one.
 */
function isMultiLine(node) {
  let root = node;
  for (let ancestor = root.parent; ancestor; ancestor = ancestor.parent) {
    root = ancestor;
  }
  return root.firstChild != root.lastChild;
}
/**
 * Collects text from `node` and its following siblings (via `next`).
 *
 * Only nodes of type "text" contribute; other node types are skipped over.
 * Collection stops at the first space inside a text literal, at a
 * "softbreak"/"linebreak" node, or when the sibling chain ends.
 *
 * @param node - starting node; siblings are reached through `next`
 * @returns {string} the concatenated text up to (not including) the stop point
 */
function getTextUntilEndOrLinebreak(node) {
  let collected = "";
  for (let cursor = node; cursor; cursor = cursor.next) {
    if (cursor.type === "softbreak" || cursor.type === "linebreak") {
      break;
    }
    if (cursor.type !== "text" || !cursor.literal) {
      continue;
    }

    const spaceAt = cursor.literal.indexOf(" ");
    if (spaceAt !== -1) {
      // Keep only what precedes the space, then stop altogether.
      collected += cursor.literal.slice(0, spaceAt);
      break;
    }
    collected += cursor.literal;
  }
  return collected;
}
// Markdown delimiter that is written back around the text of an "emph" or
// "strong" node when that node is turned back into plain text.
const formattingChangesByNodeType = {
  "emph": "_",
  "strong": "__",
};
/**
 * Returns the concatenated literal of every "text" node found while walking
 * the given node and all of its child nodes.
 *
 * Each text node is counted once, on the walker's `entering` step; nodes of
 * other types and empty literals contribute nothing.
 */
const innerNodeLiteral = (node) => {
  const pieces = [];
  const walker = node.walker();

  for (let step = walker.next(); step; step = walker.next()) {
    const { entering, node: visited } = step;
    const visitedLiteral = visited.literal;
    if (entering && visited.type === "text" && visitedLiteral) {
      pieces.push(visitedLiteral);
    }
  }

  return pieces.join("");
};
// True when the node has no previous sibling, no next sibling and no children.
const emptyItemWithNoSiblings = (node) => {
  return !(node.prev || node.next || node.firstChild);
};
/**
 * Class that wraps commonmark, adding the ability to see whether
 * a given message actually uses any markdown syntax or whether
 * it's plain text.
 *
 * Written as a constructor function whose methods are assigned on the
 * instance. Construction parses `input` straight away and stores the
 * (link-repaired) AST in `this.parsed`.
 *
 * Relies on names provided by the surrounding Node-RED function node:
 * the `commonmark` and `linkify` libraries, plus `node` and `msg` (used for
 * error reporting in `repairLinks`).
 *
 * @param input - the markdown source to parse
 */
const Markdown = function(input) {
  /**
   * Initialiser, invoked once at the end of the constructor function:
   * stores the input, parses it with commonmark and repairs links in the AST.
   * Note that this assignment replaces the instance's `constructor` property.
   */
  this.constructor = function(input) {
    this.input = input;

    const parser = new commonmark.Parser();
    this.parsed = parser.parse(this.input);
    this.parsed = this.repairLinks(this.parsed);
  };

  /**
   * This method is modifying the parsed AST in such a way that links are always
   * properly linkified instead of sometimes being wrongly emphasised in case
   * if you were to write a link like the example below:
   * https://my_weird-link_domain.domain.com
   * ^ this link would be parsed to something like this:
   * <a href="https://my">https://my</a><b>weird-link</b><a href="https://domain.domain.com">domain.domain.com</a>
   * This method makes it so the link gets properly modified to a version where it is
   * not emphasised until it actually ends.
   * See: https://github.com/vector-im/element-web/issues/4674
   *
   * @param parsed - root node of the commonmark AST; it is modified in place
   * @returns the same `parsed` root node
   */
  this.repairLinks = function(parsed) {
    const walker = parsed.walker();
    let event = null;
    let text = "";
    let isInPara = false;
    let previousNode = null;
    let shouldUnlinkFormattingNode = false;
    while ((event = walker.next())) {
      // Deliberately NOT named `node`: that name must keep referring to the
      // Node-RED node object so that `node.error(...)` below reaches the
      // Node-RED logging API instead of a commonmark AST node.
      const { node: astNode } = event;
      if (astNode.type === "paragraph") {
        if (event.entering) {
          isInPara = true;
        } else {
          isInPara = false;
        }
      }
      if (isInPara) {
        // Clear saved string when line ends
        if (
          astNode.type === "softbreak" ||
          astNode.type === "linebreak" ||
          // Also start calculating the text from the beginning on any spaces
          (astNode.type === "text" && astNode.literal === " ")
        ) {
          text = "";
          continue;
        }

        // Break up text nodes on spaces, so that we don't shoot past them without resetting
        if (astNode.type === "text" && astNode.literal) {
          const [thisPart, ...nextParts] = astNode.literal.split(/( )/);
          astNode.literal = thisPart;
          text += thisPart;

          // Add the remaining parts as siblings. They are inserted in reverse
          // order because each one is placed directly after the current node.
          nextParts.reverse().forEach((part) => {
            if (part) {
              const nextNode = new commonmark.Node("text");
              nextNode.literal = part;
              astNode.insertAfter(nextNode);
              // Make the iterator aware of the newly inserted node
              walker.resumeAt(nextNode, true);
            }
          });
        }

        // We should not do this if previous node was not a textnode, as we can't combine it then.
        if (
          (astNode.type === "emph" || astNode.type === "strong") &&
          previousNode?.type === "text"
        ) {
          if (event.entering) {
            const foundLinks = linkify.find(text);
            for (const { value } of foundLinks) {
              if (astNode?.firstChild?.literal) {
                /**
                 * NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings
                 * but this solution seems to work well and is hopefully slightly easier to understand too
                 */
                const format = formattingChangesByNodeType[astNode.type];
                const nonEmphasizedText = `${format}${innerNodeLiteral(astNode)}${format}`;
                const f = getTextUntilEndOrLinebreak(astNode);
                const newText = value + nonEmphasizedText + f;
                const newLinks = linkify.find(newText);
                // Should always find only one link here, if it finds more it means that the algorithm is broken
                if (newLinks.length === 1) {
                  const emphasisTextNode = new commonmark.Node("text");
                  emphasisTextNode.literal = nonEmphasizedText;
                  previousNode.insertAfter(emphasisTextNode);
                  astNode.firstChild.literal = "";
                  event = astNode.walker().next();
                  if (event) {
                    // Remove `em` opening and closing nodes
                    astNode.unlink();
                    previousNode.insertAfter(event.node);
                    shouldUnlinkFormattingNode = true;
                  }
                } else {
                  // `node` and `msg` here are the Node-RED function-node globals.
                  node.error("Markdown links escaping found too many links for following text: " + text, msg);
                  node.error("Markdown links escaping found too many links for modified text: " + newText, msg);
                }
              }
            }
          } else {
            if (shouldUnlinkFormattingNode) {
              astNode.unlink();
              shouldUnlinkFormattingNode = false;
            }
          }
        }
      }
      previousNode = astNode;
    }
    return parsed;
  };

  /**
   * Walks the parsed AST and reports whether the input uses no markdown
   * formatting that would require HTML output.
   *
   * @returns {boolean} true when every node is plain text (or a construct
   *   treated as plain text, see the inline comments), false otherwise
   */
  this.isPlainText = function() {
    const walker = this.parsed.walker();
    let ev;

    while ((ev = walker.next())) {
      const node = ev.node;

      if (TEXT_NODES.indexOf(node.type) > -1) {
        // definitely text
        continue;
      } else if (node.type == "list" || node.type == "item") {
        // Special handling for inputs like `+`, `*`, `-` and `2021.` which
        // would otherwise be treated as a list of a single empty item.
        // See https://github.com/vector-im/element-web/issues/7631
        if (
          node.type == "list" &&
          node.firstChild &&
          emptyItemWithNoSiblings(node.firstChild)
        ) {
          // A list with a single empty item is treated as plain text.
          continue;
        }

        if (node.type == "item" && emptyItemWithNoSiblings(node)) {
          // An empty list item with no sibling items is treated as plain text.
          continue;
        }

        // Everything else is actual lists and therefore not plaintext.
        return false;
      } else if (node.type == "html_inline" || node.type == "html_block") {
        // if it's an allowed html tag, we need to render it and therefore
        // we will need to use HTML. If it's not allowed, it's not HTML since
        // we'll just be treating it as text.
        if (isAllowedHtmlTag(node)) {
          return false;
        }
      } else {
        return false;
      }
    }
    return true;
  };

  /**
   * Renders the parsed markdown to an HTML string using a customised
   * commonmark HtmlRenderer.
   *
   * @param {object} [options]
   * @param {boolean} [options.externalLinks=false] - when true, every link
   *   gets `target="_blank"` and `rel="noreferrer noopener"`
   * @returns {string} the rendered HTML
   */
  this.toHTML = function({ externalLinks = false } = {}) {
    const renderer = new commonmark.HtmlRenderer({
      safe: false,

      // Set soft breaks to hard HTML breaks: commonmark
      // puts softbreaks in for multiple lines in a blockquote,
      // so if these are just newline characters then the
      // block quote ends up all on one line
      // (https://github.com/vector-im/element-web/issues/3154)
      softbreak: "<br />"
    });

    // Trying to strip out the wrapping <p/> causes a lot more complication
    // than it's worth, i think. For instance, this code will go and strip
    // out any <p/> tag (no matter where it is in the tree) which doesn't
    // contain \n's.
    // On the flip side, <p/>s are quite opinionated and restricted on where
    // you can nest them.
    //
    // Let's try sending with <p/>s anyway for now, though.
    const realParagraph = renderer.paragraph;
    renderer.paragraph = function (node, entering) {
      // If there is only one top level node, just return the
      // bare text: it's a single line of text and so should be
      // 'inline', rather than unnecessarily wrapped in its own
      // p tag. If, however, we have multiple nodes, each gets
      // its own p tag to keep them as separate paragraphs.
      // However, if it's a blockquote, adds a p tag anyway
      // in order to avoid deviation to commonmark and unexpected
      // results when parsing the formatted HTML.
      if (node.parent?.type === "block_quote" || isMultiLine(node)) {
        realParagraph.call(this, node, entering);
      }
    };

    renderer.link = function (node, entering) {
      const attrs = this.attrs(node);
      if (entering && node.destination) {
        attrs.push(["href", this.esc(node.destination)]);
        if (node.title) {
          attrs.push(["title", this.esc(node.title)]);
        }
        // Modified link behaviour to treat them all as external and
        // thus opening in a new tab.
        if (externalLinks) {
          attrs.push(["target", "_blank"]);
          attrs.push(["rel", "noreferrer noopener"]);
        }
        this.tag("a", attrs);
      } else {
        this.tag("/a");
      }
    };

    // Allowed tags are emitted verbatim; any other raw HTML is escaped so it
    // shows up as text.
    renderer.html_inline = function (node) {
      if (node.literal) {
        if (isAllowedHtmlTag(node)) {
          this.lit(node.literal);
        } else {
          this.lit(escaper(node.literal));
        }
      }
    };

    // HTML blocks get the same allow/escape treatment as inline HTML; no
    // extra line breaks are added around them.
    renderer.html_block = function (node) {
      renderer.html_inline(node);
    };

    return renderer.render(this.parsed);
  };

  /*
   * Render the markdown message to plain text. That is, essentially
   * just remove any backslashes escaping what would otherwise be
   * markdown syntax
   * (to fix https://github.com/vector-im/element-web/issues/2870).
   *
   * N.B. this does **NOT** render arbitrary MD to plain text - only MD
   * which has no formatting. Otherwise it emits HTML(!).
   */
  this.toPlaintext = function() {
    const renderer = new commonmark.HtmlRenderer({ safe: false });

    renderer.paragraph = function (node, entering) {
      // as with toHTML, only append lines to paragraphs if there are
      // multiple paragraphs
      if (isMultiLine(node)) {
        if (!entering && node.next) {
          this.lit("\n\n");
        }
      }
    };

    renderer.html_block = function (node) {
      if (node.literal) this.lit(node.literal);
      if (isMultiLine(node) && node.next) this.lit("\n\n");
    };

    return renderer.render(this.parsed);
  };

  this.constructor(input);
};
try {
let md = new Markdown(msg.payload);
msg.payload = md.toHTML();
} catch(e) {
msg.payload = "ERROR";
}
return msg;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment