Created
March 7, 2023 17:40
-
-
Save glaforge/d8535e0f6b1e8d475fb0103184f9432d to your computer and use it in GitHub Desktop.
Transform my old blog posts into Hugo-friendly Markdown article documents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@Grab('org.jsoup:jsoup:1.15.4') | |
import org.jsoup.Jsoup | |
import org.jsoup.safety.Safelist | |
@Grab('io.github.furstenheim:copy_down:1.1') | |
import io.github.furstenheim.* | |
import java.nio.file.Paths | |
import java.nio.file.Files | |
import java.text.SimpleDateFormat | |
// Parses the date text extracted from an article page (pattern 'dd MMM, yyyy').
final sdfFrom = new SimpleDateFormat('dd MMM, yyyy', Locale.ENGLISH)
// Renders a date as the year/month/day sub-folder path used under the output folder.
final sdfTo = new SimpleDateFormat('yyyy/MM/dd', Locale.ENGLISH)
// Renders a date as the timestamp written to the front matter. The locale is pinned,
// like the two formatters above, so the output does not vary with the JVM default locale.
final sdfIso = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSXXX", Locale.ENGLISH)

// Root of the blog being scraped; article links found on archive pages are appended to it.
final baseUrl = 'https://glaforge.appspot.com'
// Prefix of the paginated archive pages; the page number is appended to it.
final baseArchivePageUrl = baseUrl + '/archives/p'
// Archive page numbers to crawl (inclusive range).
final pageRange = 1..49
// Folder that receives the generated Markdown files.
final outputFolder = "/tmp/blog-output"

// Start from a clean slate: remove whatever a previous run left in the output folder.
Paths.get(outputFolder).deleteDir()
// Walk every archive page in pageRange and gather the href of each article
// title link, flattened into a single list in page order.
def articleUrls = pageRange.collectMany { int pageNumber ->
    def titleLinks = Jsoup.connect(baseArchivePageUrl + pageNumber)
            .get()
            .select('.archive-post-title > h3 > a')
    // Spread call: read the href attribute of each matched <a> element.
    titleLinks*.attr('href')
}
// ---------------------------------------------------------------------------
// Converts every collected article into a Markdown file with YAML front
// matter, written to <outputFolder>/<yyyy>/<MM>/<dd>/<slug>.md.
// ---------------------------------------------------------------------------

// Loop-invariant helpers, built once instead of once per article.

// HTML tags/attributes allowed to survive sanitization (jsoup's "basic with images" set).
def safelist = Safelist.basicWithImages()

// Markdown flavour: ATX headings and fenced code blocks.
def markdownOptions = OptionsBuilder.anOptions()
        .withHeadingStyle(HeadingStyle.ATX)
        .withCodeBlockStyle(CodeBlockStyle.FENCED)
        .build()

// Wraps a value in a YAML double-quoted scalar, escaping backslashes and double
// quotes so that arbitrary titles/tags cannot break the front matter.
def yamlQuote = { String value ->
    '"' + value.replace('\\', '\\\\').replace('"', '\\"') + '"'
}

articleUrls.each { relArticleUrl ->
    def fullArticleUrl = baseUrl + relArticleUrl
    // The slug is the last path segment of the article URL.
    def slug = fullArticleUrl.substring(fullArticleUrl.lastIndexOf('/') + 1)
    def articlePageDoc = Jsoup.connect(fullArticleUrl).get()

    // get date
    // The date is expected to start at character offset 10 of the .post-date text
    // and to end before the first '(' (or at the end of the text when there is none).
    def dateBlockText = articlePageDoc.select('.post-date').text()
    def parenIndex = dateBlockText.indexOf('(')
    def dateEnd = parenIndex >= 0 ? parenIndex : dateBlockText.length()
    if (dateEnd < 10) {
        // Fail with a diagnosable message instead of a bare StringIndexOutOfBoundsException.
        throw new IllegalStateException(
                "Unexpected .post-date text '${dateBlockText}' in ${fullArticleUrl}".toString())
    }
    def date = sdfFrom.parse(dateBlockText.substring(10, dateEnd).trim())
    def formattedDate = sdfTo.format(date)
    def isoDate = sdfIso.format(date)

    // get title
    def title = articlePageDoc.select('.post-title').text()

    // get categories
    // Lower-cased (locale-independent), spaces turned into hyphens, and the first
    // '-platform' occurrence dropped.
    def tags = articlePageDoc.select('.post-meta a').collect {
        it.text().toLowerCase(Locale.ENGLISH).replace(' ', '-') - '-platform'
    }

    // get article content
    def postBody = articlePageDoc.select('.post-body').first()
    if (postBody == null) {
        // first() yields null when nothing matched; report which page is malformed.
        throw new IllegalStateException(
                "No .post-body element found in ${fullArticleUrl}".toString())
    }
    def sanitizedHtml = Jsoup.clean(postBody.outerHtml(), fullArticleUrl, safelist)

    // turn into markdown
    def md = new CopyDown(markdownOptions).convert(sanitizedHtml)

    // output with front matter
    // createFile fails if the file already exists, so two articles that map to the
    // same date/slug are reported instead of silently overwriting each other.
    def articleFolder = Files.createDirectories(Paths.get(outputFolder, formattedDate))
    def mdFilePath = Files.createFile(articleFolder.resolve(slug + '.md'))
    println mdFilePath

    def frontMatter = [
            '---',
            "title: ${yamlQuote(title)}",
            "date: \"${isoDate}\"",
            "tags: [${tags.collect(yamlQuote).join(', ')}]",
            '---',
            ''
    ].join('\n')

    // Single write with an explicit charset rather than two appends in the
    // platform default encoding.
    Files.write(mdFilePath, (frontMatter + md).getBytes('UTF-8'))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment