Hackaday.io article to markdown converter for migrating or backing up projects/posts
#!/usr/bin/env python3

# Convert hackaday posts to markdown with images stored nearby
#
# This needs the following modules to run:
#
# - https://github.com/matthewwithanm/python-markdownify
# - https://2.python-requests.org/en/master/
# - https://www.crummy.com/software/BeautifulSoup/
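#
# Usage example (a sketch; the script filename and URL here are hypothetical):
#
#   python3 hackaday-to-markdown.py --auto-dir https://hackaday.io/project/1234-demo/log/5678-first-post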
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter

import datetime
import logging
import os
import re
import requests
import sys
class HackadayMarkdownConverter(MarkdownConverter):
    '''
    Modified markdown converter to handle specifics in Hackaday.io articles
    '''

    def process_tag(self, node, children_only=False):
        '''
        Remove spaces added at the start of some paragraphs
        '''
        return re.sub(
            r'(^|\n)[ \t]+([^\s])',
            r'\1\2',
            super().process_tag(node, children_only)
        )

    def convert_br(self, el, text):
        # Don't honour forced breaks
        return ''

    def convert_figcaption(self, el, text):
        '''
        Wrap figcaption text in a <caption> element to differentiate it from body text
        '''
        if text:
            return '<caption>' + text + '</caption>\n\n'
        else:
            return ''

    def convert_table(self, el, text):
        '''
        Dump tables as HTML in the source
        (This markdown converter doesn't support tables)
        '''
        return el.prettify() + '\n'

    def convert_figure(self, el, text):
        '''
        Handle <figure> elements as block images with a possible caption
        '''
        md = ''

        if el.find('img'):
            md += self.convert_img(el.find('img'), None) + '\n\n'

        if el.find('figcaption'):
            captionEl = el.find('figcaption')
            md += self.convert_figcaption(captionEl, captionEl.get_text())

        return md
def htmlToMd(html):
    '''
    Convert an HTML string to markdown
    '''
    return HackadayMarkdownConverter(heading_style='atx', bullets='-').convert(html)
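# A quick illustration of the settings above (a sketch; exact whitespace
# depends on the installed markdownify version):
#
#   htmlToMd('<h2>Parts</h2><ul><li>MCU</li></ul>')
#   # -> '## Parts\n\n- MCU\n'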
def getLastPathSegment(url):
    '''
    Grab the last path segment from a URL
    '''
    return next(re.finditer(r'https://.*/([^/?]+)', url)).group(1)
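# e.g. getLastPathSegment('https://hackaday.io/project/1234-demo/log/5678-first-post')
# returns '5678-first-post' (hypothetical URL, for illustration)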
def fetchArticle(url):
    '''
    Get the HTML content from a hackaday.io post with other primary fields
    '''
    req = requests.get(url)
    rawHtml = req.content.decode('utf-8')

    # BeautifulSoup doesn't handle all HTML entities correctly - replace them manually before reading
    rawHtml = rawHtml.replace('&#39;', "'")

    html = BeautifulSoup(rawHtml, 'html.parser')

    title = html.select_one('.headline h1')
    content = html.select_one('.post-content')

    # Extract publish date/time (always a US format date)
    publishDate = datetime.datetime.strptime(
        html.select_one('.time-card').get_text(),
        '%m/%d/%Y at %H:%M'
    )

    return {
        'metadata': {
            'date': publishDate,
            'slug': getLastPathSegment(url),
            'original_url': url,
        },
        'title': title.get_text(),
        'content': content,
    }
def findImageUrl(htmlNode):
    '''
    Given an HTML image node, return the best URL for the content

    Hackaday images are usually lazy loaded using the URL from data-src
    '''
    attributes = ['src', 'data-src']

    for attr in attributes:
        if attr in htmlNode.attrs:
            return htmlNode.attrs[attr]

    raise Exception('Failed to find src attribute for image node: ' + str(htmlNode))
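# For example, a lazy-loaded node like <img data-src="https://cdn.hackaday.io/images/1234.jpg">
# (hypothetical markup) resolves to its data-src URL, since no src attribute is present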
def downloadFile(url):
    '''
    Download a file to disk using the filename from the URL

    Returns the filename of the downloaded file
    '''
    outputFile = getLastPathSegment(url)
    logging.info('Saving file %s as %s' % (url, outputFile))

    req = requests.get(url, allow_redirects=True)

    with open(outputFile, 'wb') as handle:
        handle.write(req.content)

    return outputFile
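# e.g. downloadFile('https://cdn.hackaday.io/images/1234.jpg') writes '1234.jpg' into
# the current working directory and returns that name (hypothetical URL, for illustration)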
def writeFrontMatter(handle, data):
    handle.write('---\n')

    for key in data.keys():
        handle.write('%s: %s\n' % (key, data[key]))

    handle.write('---\n\n')
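# The front matter written above comes out like this (values illustrative):
#
#   ---
#   date: 2022-06-30 02:21:00
#   slug: 5678-first-post
#   original_url: https://hackaday.io/project/1234-demo/log/5678-first-post
#   ---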
def savePost(url, force=False, keepHtml=True):
    '''
    Download a hackaday.io post and all of its images

    Saves files to the current working directory
    '''
    source = fetchArticle(url)

    # Strip article ID for markdown filename
    outputName = re.sub(r'^\d+-', '', source['metadata']['slug'])
    articlePath = outputName + '.md'
    htmlPath = '_' + outputName + '.original.html'

    if not force and os.path.exists(articlePath):
        logging.info('Output file "%s" for url %s already exists!' % (articlePath, url))
        logging.info('Refusing to overwrite existing file without --force')
        return

    content = source['content']

    # Find and download images in the content
    # Once downloaded, the URL is replaced with a relative path to the file on disk
    for image in content.find_all('img'):
        image.attrs['src'] = downloadFile(findImageUrl(image))

    title = '# %s\n\n' % source['title']
    htmlStr = content.encode(formatter='html5').decode('utf-8')
    markdown = htmlToMd(htmlStr)

    with open(articlePath, mode='w', encoding='utf-8') as handle:
        writeFrontMatter(handle, source['metadata'])
        handle.write(title)
        handle.write(markdown)

    # Output original HTML with image paths changed
    if keepHtml:
        with open(htmlPath, mode='w', encoding='utf-8') as handle:
            handle.write(htmlStr)
if __name__ == '__main__':
    import argparse

    # Enable logging output
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    parser = argparse.ArgumentParser(description='Convert a hackaday.io post to markdown')
    parser.add_argument('url', nargs='+', help='Post URL')
    parser.add_argument('--force', '-f', action='store_true', help='Overwrite if the file already exists')
    parser.add_argument('--auto-dir', '-d', action='store_true', help='Place in a directory based on the URL')

    args = parser.parse_args()

    for url in args.url:
        # Sanity check the URL we've been given
        if "https://hackaday.io" not in url:
            logging.warning('This does not look like a hackaday.io URL: ' + url)

        if args.auto_dir:
            # Get slug without article ID
            slug = re.sub(r'^\d+-', '', getLastPathSegment(url))

            # Find current highest local dir number
            localId = 0
            prefix = r'^(\d+)-'

            for name in os.listdir():
                if re.match(prefix, name):
                    idPrefix = re.findall(prefix, name)[0]
                    localId = max(localId, int(idPrefix))

            # Increment to get the next local dir number
            localId += 1

            # Create a dir in sequence from slug
            dirname = '%03d-%s' % (localId, slug)
            print("Downloading to dir: " + dirname)
            os.mkdir(dirname)
            os.chdir(dirname)

        savePost(url, force=args.force)

        if args.auto_dir:
            os.chdir('..')
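# With --auto-dir, each post lands in its own numbered directory next to the
# script, e.g. (names illustrative):
#
#   001-first-post/
#       first-post.md
#       _first-post.original.html
#       1234.jpg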