Skip to content

Instantly share code, notes, and snippets.

@jfrobbins
Created August 7, 2014 12:40
Show Gist options
  • Select an option

  • Save jfrobbins/b586b3a29fed71cb212b to your computer and use it in GitHub Desktop.

Select an option

Save jfrobbins/b586b3a29fed71cb212b to your computer and use it in GitHub Desktop.
#! /usr/bin/env python3
###
# original author: Jon Robbins
####
# This file converts markdown-formatted octopress files into markdown-formatted pelican files
# and also "fixes" html links by changing them to md links
###
# octopress2pelican.py
#syntax is:
# ./octopress2pelican.py filename
# or
# ./octopress2pelican.py --dir /some/dirPath/
###
# there are some hard-coded items in the __init__ to handle some things (links, output dir, etc)
###
# this file is free to edit/modify/redistribute
# (with no warranty expressed or implied)
#
# this file works for me, it may or may not work for you.
###
import os
import re
import sys
def ensure_dir(f):
    """Make sure the directory that would contain the file path *f* exists.

    Only the directory part of *f* (``os.path.dirname(f)``) is created; the
    file itself is never touched.

    Args:
        f: path of a file that is about to be written.

    Returns:
        None.
    """
    d = os.path.dirname(f)
    print("checking if dir exists: " + d)
    if not d:
        # A bare file name has no directory component: the file goes into the
        # current working directory, and os.makedirs('') would raise.
        return
    if not os.path.exists(d):
        print("creating dir")
        # exist_ok guards against the directory appearing between the
        # existence check above and this call.
        os.makedirs(d, exist_ok=True)
class myfile(object):
    """Convert Octopress/Jekyll markdown posts into Pelican markdown posts.

    An instance is built from the command-line arguments (either a single
    file name, or ``--dir``/``-d`` followed by a directory) and then driven
    through :meth:`convertFiles`. For every post the ``---`` delimited
    Octopress header is replaced by a Pelican header, HTML anchors are
    rewritten as markdown links and ``{% img ... %}`` tags are rewritten as
    markdown images.

    Attributes set in ``__init__``:
        authors: value written to the ``Authors:`` line.
        data: list of lines of the file read last (``''`` before any read).
        fname: path of the file being converted (``''`` in directory mode).
        d: directory given with ``--dir``, or the directory part of ``fname``.
        header: Octopress header fields of the current file, keyed by the
            lower-cased field name.
        outputDir: directory the converted files are written to; when empty
            a ``pelican`` sub-directory of ``d`` is used instead.
        internalLinkPathChange: ``{old: new}`` substitutions applied to
            image paths.
        replacements: ``{old: new}`` substitutions applied to every body row.
    """

    # An HTML anchor: group 2 is the quoted href value, group 3 the link
    # text. Other attributes before or after href are tolerated; DOTALL is
    # needed because <br> replacement can put newlines inside a row.
    _ANCHOR_RE = re.compile(
        r'<a\s+(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1[^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL)

    # Supported title/alt syntaxes of the img tag, tried in this order:
    # (title opener, alt opener, alt closer, title closer).
    _IMG_STYLES = (
        (' [', '[', ']]', ']'),
        (" '", "' '", "'", "'"),
        (' "', '" "', '"', '"'),
    )

    def __init__(self, args):
        """Set the hard-coded defaults and parse *args*.

        Only the first argument decides the mode: ``--dir``/``-d`` selects
        directory mode and takes the following argument as the directory;
        anything else is taken as the single file to convert.

        Args:
            args: command-line arguments without the program name.

        Raises:
            ValueError: ``--dir``/``-d`` was given without a directory.
        """
        print("init object")
        self.authors = 'Your Name'
        self.data = ''
        self.fname = ''
        self.d = ''
        self.header = {}
        self.outputDir = '/path/to/your/pelican/content/' #check for a cli flag for this later
        #paths to change (example is for embedded images):
        self.internalLinkPathChange = { '/images/posts/':'/images/' }
        #strings to replace later:
        self.replacements = { '<br />':'\n', '<br>':'\n', '<span class="fullpost"></span>':'', 'Category: ##':'tags: #', 'Posted by: jamba':'' }

        if args:
            first = args[0]
            if first == '--dir' or first == '-d':
                if len(args) < 2:
                    raise ValueError(first + ' requires a directory path')
                self.d = args[1]
            else:
                self.fname = first
                self.d = os.path.dirname(first)
        print("fname: " + self.fname)
        print("dir:" + self.d)

    def readFile(self, fname, commenter=None):
        """Read *fname* into ``self.data`` as a list of lines.

        Args:
            fname: path of an existing file, or an object offering
                ``readlines()`` and ``close()`` (it is closed after reading).
            commenter: optional comment marker; when given, everything from
                the marker on is cut from each line, the trailing newline is
                removed and lines that end up empty are dropped.

        Returns:
            True when lines were read and stored, False otherwise.
        """
        self.inFile = fname
        lines = []
        if os.path.isfile(str(fname)):
            fname = os.path.abspath(fname)
            print('trying to read file: ' + fname)
            try:
                # The BOM check below only makes sense for UTF-8 text, so
                # decode explicitly instead of relying on the locale.
                with open(fname, 'r', encoding='utf-8') as f:
                    print("file is opened")
                    lines = f.readlines()
            except UnicodeDecodeError:
                # e.g. an image sitting in the posts directory
                print('file could not be read')
                return False
            self.fname = fname
        else:
            #attempt to use the fname var like an io object
            try:
                lines = fname.readlines()
            except AttributeError:
                print('file does not exist: ' + str(fname))
                return False
            fname.close()
            # in-memory streams have no name
            self.fname = getattr(fname, 'name', '')
            self.isFile = True

        if not lines:
            print('file could not be read')
            return False

        print("file was read")
        if lines[0].startswith('\ufeff'):
            # strip the unicode BOM; drop the line if nothing else is on it
            lines[0] = lines[0][1:]
            if not lines[0]:
                lines = lines[1:]
        if commenter:
            # partition() keeps lines without a marker intact, unlike
            # slicing with the -1 that find() returns for "not found".
            trimmed = (line.partition(commenter)[0].rstrip('\n') for line in lines)
            lines = [line for line in trimmed if line]
        self.data = lines
        return True

    def returnData(self):
        """Return the lines stored by the last successful :meth:`readFile`."""
        return self.data

    def convertFiles(self):
        """Convert the single file, or every regular file of the directory.

        Single-file mode is used whenever ``self.fname`` is set, even if the
        file name carries a directory part; otherwise all regular files
        directly inside ``self.d`` are converted in sorted order.
        """
        dirMode = not self.fname
        if not dirMode:
            targets = [self.fname]
        elif self.d:
            targets = sorted(
                os.path.join(self.d, name)
                for name in os.listdir(self.d)
                if os.path.isfile(os.path.join(self.d, name)))
        else:
            print('no file or directory given, nothing to convert')
            return

        for path in targets:
            print(path)
            self.fname = path
            if self.readFile(path):
                print('file contents read')
                self.convert()
        if dirMode:
            # readFile/convert left the last file name behind; clear it so a
            # second call still runs in directory mode.
            self.fname = ''

    def readHeader(self):
        """Parse the Octopress header of ``self.data`` into ``self.header``.

        The header is the run of ``name: value`` lines between the first two
        ``---`` lines, for example::

            ---
            layout: post
            title: "Hosting some more services"
            date: 2014-04-05 12:27:58 -0400
            comments: true
            categories: hosting,software,owncloud,mediagoblin,pumpio,xmpp
            ---

        Field names are lower-cased, values are stripped, and square
        brackets are removed from ``categories``. Lines without a colon are
        ignored.

        Returns:
            Index of the first line after the header, or 0 when the data
            contains no ``---`` line at all.
        """
        # start clean so fields of a previously converted file cannot leak
        self.header = {}

        start = None
        for idx, row in enumerate(self.data):
            if row.strip() == '---':
                start = idx + 1
                break
        if start is None:
            print("no header found")
            return 0
        print("header start line: " + str(start))

        nLine = start
        for row in self.data[start:]:
            nLine += 1
            if row.strip() == '---':
                break
            # split on the first colon only: values such as dates and
            # titles may contain colons themselves
            name, sep, value = row.partition(':')
            if not sep:
                continue
            name = name.strip().lower()
            value = value.strip()
            if name == 'categories':
                value = value.replace('[', '').replace(']', '') #remove brackets
            self.header[name] = value
        return nLine

    def pelicanHeader(self, fewLines):
        """Build the Pelican header text from ``self.header``.

        The result looks like::

            Title: My super title
            Date: 2010-12-03
            Modified: 2010-12-03
            Category: archive
            Tags: pelican, publishing
            Slug: my-super-title
            Authors: Your Name

        Title, slug and date are derived from the ``title`` and ``date``
        header fields, tags from ``categories``; missing fields yield empty
        values. ``self.header['title']`` is updated to the cleaned-up title.

        Args:
            fewLines: first lines of the post body. Accepted for
                compatibility; currently unused because no ``Summary:``
                line is written.

        Returns:
            The header as one string; every line ends with a newline.
        """
        date = self.header.get('date', '')[:10]

        # hyphens stand in for spaces in some titles; quotes are dropped
        title = self.header.get('title', '').replace('-', ' ').replace('"', '')
        slug = title.lower().replace(' ', '-') #slugs can have no spaces/quotes,etc
        # NOTE(review): this also rewrites ordinary words containing "quot"
        # (e.g. "quote") - confirm that is acceptable for the titles in use.
        title = title.replace('quot', '"')
        self.header['title'] = title

        fields = (
            ("Title", title),
            ("Date", date),
            ("Modified", date),
            ("Category", 'archive'),
            ("Tags", self.header.get('categories', '')),
            ("Slug", slug),
            ("Authors", self.authors),
        )
        return ''.join(name + ": " + value + '\n' for name, value in fields)

    def changeLineLinksToMarkdown(self, row):
        """Rewrite HTML anchors in *row* as markdown links.

        ``<a href="http://www.thisisalink.com/foo/bar.html">text</a>``
        becomes ``[text](http://www.thisisalink.com/foo/bar.html)``.
        Anchors without a closing ``</a>`` or without a quoted href are left
        untouched.

        Args:
            row: one row of post text.

        Returns:
            The row with every complete anchor converted.
        """
        if '<a' not in row.lower():
            return row
        return self._ANCHOR_RE.sub(
            lambda m: '[' + m.group(3) + '](' + m.group(2) + ')', row)

    def correctInternalLinks(self, row):
        """Rewrite Octopress ``{% img ... %}`` tags in *row* as markdown images.

        Examples::

            {% img /images/posts/irssi_setup.png [My Irssi Setup [my irssi setup]] %}
            {% img /images/posts/rasPiPump.jpg 'Raspberry Pi Pump (with LCD screen)' 'My Raspberry Pi Pump' %}

        become ``![title]({filename}/images/...)``. The path substitutions
        of ``self.internalLinkPathChange`` are applied to the image path. A
        tag that is never closed with `` %}`` is left untouched.

        Args:
            row: one row of post text.

        Returns:
            The row with every complete img tag converted.
        """
        prefx = '{% img '
        postfx = ' %}'
        if prefx not in row:
            return row
        print('correcting internal links')

        pieces = []
        pos = 0
        while True:
            start = row.find(prefx, pos)
            if start < 0:
                break
            end = row.find(postfx, start)
            if end < 0:
                # unterminated tag: keep the remainder as it is
                break
            print('orig link section: ' + row[start:end + len(postfx)])
            pieces.append(row[pos:start])
            pieces.append(self._imgTagToMarkdown(row[start + len(prefx):end]))
            pos = end + len(postfx)
        pieces.append(row[pos:])
        return ''.join(pieces)

    def _imgTagToMarkdown(self, inner):
        """Turn the text between ``{% img `` and `` %}`` into a markdown image."""
        # default: the tag holds nothing but the path
        link = inner.strip()
        linkText = ''
        for titleOpen, altOpen, altClose, titleClose in self._IMG_STYLES:
            cut = inner.find(titleOpen)
            if cut < 0:
                continue
            link = inner[:cut]
            linkText = inner[cut + len(titleOpen):]
            altAt = linkText.find(altOpen)
            altEnd = -1
            if altAt >= 0:
                altEnd = linkText.find(altClose, altAt + len(altOpen))
            if altEnd >= 0:
                altText = linkText[altAt + len(altOpen):altEnd]
                print('alttext: ' + altText)
                # the separate alt text is dropped, only the title is kept
                linkText = linkText.replace(altOpen + altText + altClose, '')
            elif linkText.endswith(titleClose):
                # title without alt text: remove its closing delimiter
                linkText = linkText[:-len(titleClose)]
            break
        print('linktext: ' + linkText)

        for oldLink, newLink in self.internalLinkPathChange.items():
            link = link.replace(oldLink, newLink)
        print('link: ' + link)
        return '![' + linkText + ']({filename}' + link + ')'

    def parseContents(self, row):
        """Apply all body conversions to one *row* and return the result.

        The plain string substitutions of ``self.replacements`` run first,
        then HTML anchors and finally Octopress img tags are converted.
        """
        for orig, new in self.replacements.items():
            row = row.replace(orig, new)
        #check for <a href type links and change to markdown [text](link)
        row = self.changeLineLinksToMarkdown(row)
        #change jekyll/octopress internal image links to pelican:
        row = self.correctInternalLinks(row)
        return row

    def convert(self):
        """Write the Pelican version of the file held in ``self.data``.

        The output keeps the input's base name and goes to
        ``self.outputDir``, or to ``<self.d>/pelican`` when no output
        directory is configured. Files whose header lacks ``title`` or
        ``date`` are skipped with a message instead of being written.
        """
        print('converting file: ' + self.fname)
        #get jekyll header info:
        nLine = self.readHeader()
        print("post header nline = " + str(nLine))

        missing = [key for key in ('title', 'date') if key not in self.header]
        if missing:
            # not an Octopress post (or a broken one): do not abort the run
            print('skipping file, header is missing: ' + ', '.join(missing))
            return

        baseName = os.path.basename(self.fname)
        if not self.outputDir:
            oFileName = os.path.join(self.d, 'pelican', baseName)
        else:
            oFileName = os.path.join(self.outputDir, baseName)
        print("odir: " + os.path.dirname(oFileName))
        ensure_dir(oFileName)

        body = self.data[nLine:]
        # build the header before opening so a failure leaves no empty file
        hdr = self.pelicanHeader(body[:5])
        print("writing output to file: " + oFileName)
        with open(oFileName, 'w', encoding='utf-8') as of:
            of.write(hdr + '\n')
            of.writelines(self.parseContents(row) for row in body)
        print("done writing file")
if __name__ == '__main__':
    # Script entry point: everything after the program name goes to myfile.
    cliArgs = sys.argv[1:]
    if not cliArgs:
        print("no args, nothing to do")
    else:
        # Keep the module-level name ``mydata``: code elsewhere in this file
        # may look the converter up under that global name.
        mydata = myfile(cliArgs)
        mydata.convertFiles()
        print("done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment