Last active
April 29, 2019 18:23
-
-
Save clipperhouse/010d4666892807afee16ba7711b41401 to your computer and use it in GitHub Desktop.
A Go script to convert Medium export (HTML) to Markdown, for use with Hugo. It’s a one-use type of thing for me, so it ain’t beautiful.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// A quick script for converting Medium HTML files to Markdown, suitable for use
// in a static site generator such as Hugo or Jekyll.
package main
import (
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"text/template"

	"github.com/PuerkitoBio/goquery"
	"github.com/lunny/html2md"
)
var (
	// src is the location of the exported, unzipped Medium HTML files.
	src = "/Users/mwsherman/medium-export"

	// dest is the destination for the generated Markdown files, perhaps the
	// content folder for Hugo or Jekyll.
	dest = "/Users/mwsherman/tmp"
)
func main() { | |
filez, err := ioutil.ReadDir(src) | |
if err != nil { | |
panic(err) | |
} | |
for _, f := range filez { | |
if !strings.HasSuffix(f.Name(), ".html") { | |
continue | |
} | |
inpath := filepath.Join(src, f.Name()) | |
doc, err := read(inpath) | |
if err != nil { | |
log.Fatal(err) | |
} | |
post := process(doc) | |
post.Draft = strings.HasPrefix(f.Name(), "draft_") | |
if len(post.Title) > 0 && len(post.Body) > 0 { | |
outpath := filepath.Join(dest, slug(post.Title)+".md") | |
write(post, outpath) | |
} | |
} | |
} | |
// post holds the values that are rendered into one Markdown output file.
type post struct {
	Title  string // post title; also the source of the output file name
	Author string // author name
	Date   string // publication date, kept as the raw string found in the HTML
	Body   string // post content converted to Markdown
	Draft  bool   // true when the post should be marked as a draft
}
// nbsp maps a non-breaking space (U+00A0) to an ordinary space and returns
// every other rune unchanged. Its signature makes it usable as the mapping
// function passed to strings.Map.
func nbsp(r rune) rune {
	if r == '\u00A0' {
		return ' '
	}
	return r
}
func process(doc *goquery.Document) post { | |
title := doc.Find("title").Text() | |
date, _ := doc.Find("time").Attr("datetime") | |
author := doc.Find(".p-author.h-card").Text() | |
body := "" | |
doc.Find("div.section-inner").Each(func(i int, s *goquery.Selection) { | |
h, _ := s.Html() | |
body += html2md.Convert(h) | |
}) | |
body = strings.Map(nbsp, body) | |
redundant := fmt.Sprintf("### %s", title) // post body shouldn't repeat the title | |
if strings.HasPrefix(body, redundant) { | |
body = body[len(redundant):] | |
} | |
body = strings.TrimSpace(body) | |
p := post{ | |
Title: title, | |
Author: author, | |
Date: date, | |
Body: body, | |
} | |
return p | |
} | |
func read(path string) (*goquery.Document, error) { | |
f, err := os.Open(path) | |
if err != nil { | |
panic(err) | |
} | |
defer f.Close() | |
// Load the HTML document | |
return goquery.NewDocumentFromReader(f) | |
} | |
func write(post post, path string) { | |
f, err := os.Create(path) | |
if err != nil { | |
panic(err) | |
} | |
defer f.Close() | |
err = tmpl.Execute(f, post) | |
if err != nil { | |
panic(err) | |
} | |
} | |
var (
	// spaces matches a run of whitespace, which becomes a single hyphen.
	spaces = regexp.MustCompile(`[\s]+`)
	// notallowed matches anything that is not a letter, number, dot or whitespace.
	notallowed = regexp.MustCompile(`[^\p{L}\p{N}.\s]`)
	// athe matches a leading "a-" or "the-" in an already hyphenated, lower-cased slug.
	athe = regexp.MustCompile(`^(a\-|the\-)`)
)

// slug turns a post title into a lower-case, hyphen-separated string suitable
// for use as a file name.
//
// "%" and "#" are spelled out as " percent" and " sharp"; every other
// character that is not a letter, number, dot or whitespace is dropped;
// whitespace runs become single hyphens; and a leading "a-" or "the-" is
// removed. Leading and trailing hyphens are trimmed, so surrounding whitespace
// in the title or a leading "%"/"#" does not produce a slug that starts or
// ends with "-".
func slug(s string) string {
	result := s
	result = strings.Replace(result, "%", " percent", -1)
	result = strings.Replace(result, "#", " sharp", -1)
	result = notallowed.ReplaceAllString(result, "")
	result = spaces.ReplaceAllString(result, "-")
	result = strings.ToLower(result)
	// Trim before stripping the article: athe is anchored at the start, so a
	// stray leading hyphen would otherwise hide the "a-"/"the-" prefix.
	result = strings.Trim(result, "-")
	result = athe.ReplaceAllString(result, "")
	return result
}
// tmpl renders a post as a Markdown file with YAML front matter.
//
// Title and author are written with %q so that quotes and backslashes inside
// them are escaped instead of terminating the quoted YAML scalar early. The
// "draft" line is emitted only for drafts; the trim markers keep a non-draft
// post from getting an empty line in its place.
var tmpl = template.Must(template.New("").Parse(`---
title: {{ printf "%q" .Title }}
date: {{ .Date }}
author: {{ printf "%q" .Author }}
{{- if .Draft }}
draft: {{ .Draft }}
{{- end }}
---
{{ .Body }}
`))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment