jfrobbins · August 7, 2014 12:40
diff --git a/octopressMD_to_pelicanMD.py b/octopressMD_to_pelicanMD.py
 #! /usr/bin/env python3
 ###
 # original author: Jon Robbins
 ####
 # This file converts markdown-formatted octopress files into markdown-formatted pelican files
 #   and also "fixes" html links by changing them to md links
 ###
 #  octopressMD_to_pelicanMD.py
 #syntax is:
 #   ./octopressMD_to_pelicanMD.py filename
 #   or
 #   ./octopressMD_to_pelicanMD.py --dir /some/dirPath/
 ###
 # there are some hard-coded items in the __init__ to handle some things (links, output dir, etc)
 ###
 # this file is free to edit/modify/redistribute
 # (with no warranty expressed or implied)
 #
 # this file works for me, it may or may not work for you.
 ###


 import sys,os

 def ensure_dir(f):
    """Create the parent directory of file path *f* if it is missing.

    Only the directory part of *f* is created, never *f* itself.  A bare
    file name (no directory component) refers to the current directory, so
    there is nothing to create and the function simply returns.

    Args:
        f: path of a file that is about to be written.
    """
    d = os.path.dirname(f)
    print("checking if dir exists: " + d)
    if not d:
        # os.makedirs('') raises FileNotFoundError; the current directory
        # always exists, so there is nothing to do
        return
    if not os.path.exists(d):
        print("creating dir")
        # exist_ok guards against the directory appearing between the
        # existence check above and this call
        os.makedirs(d, exist_ok=True)

 class myfile(object):
    """Convert Octopress/Jekyll markdown posts into Pelican markdown posts.

    One instance converts either a single post or every file of a
    directory, depending on the argument list given to the constructor.
    For each post the front-matter header is rewritten as Pelican metadata,
    HTML anchors become markdown links and ``{% img ... %}`` tags become
    markdown images.

    Author, output directory and the replacement tables are hard-coded in
    ``__init__``; edit them before running.
    """

    def __init__(self, args):
        """Set the hard-coded defaults and pick the input from *args*.

        Only the first argument is examined: ``--dir``/``-d`` selects
        directory mode and must be followed by the directory path; anything
        else is taken as the path of the single file to convert.

        Args:
            args: command-line arguments without the script name.

        Raises:
            ValueError: ``--dir``/``-d`` was given without a directory path.
        """
        print("init object")
        self.authors = 'Your Name'
        self.data = ''
        self.fname = ''
        self.d = ''
        self.header = {}
        # True when a whole directory is converted, False for a single file
        self.dirMode = False
        self.outputDir = '/path/to/your/pelican/content/' #check for a cli flag for this later

        #paths to change (example is for embedded images):
        self.internalLinkPathChange = { '/images/posts/':'/images/' }

        #strings to replace later:
        self.replacements = { '<br />':'\n', '<br>':'\n', '<span class="fullpost"></span>':'', 'Category: ##':'tags: #', 'Posted by: jamba':'' }

        args = list(args)
        if args:
            first = args[0]
            if first == '--dir' or first == '-d':
                if len(args) < 2:
                    raise ValueError(first + " requires a directory path")
                self.d = args[1]
                self.dirMode = True
            else:
                self.fname = first
                self.d = os.path.dirname(self.fname)

        print("fname: " + self.fname)
        print("dir:" + self.d)

    def readFile(self, fname, commenter=None):
        """Read *fname* into ``self.data`` as a list of lines.

        Args:
            fname: path of an existing file, or an object offering
                ``readlines()`` and ``close()`` (it is closed after reading).
            commenter: optional comment marker.  When given, everything from
                the marker on is cut from each line, the trailing newline is
                removed and lines that end up empty are dropped.

        Returns:
            True when at least one line was stored in ``self.data``,
            False otherwise.
        """
        self.inFile = fname
        lines = []

        if os.path.isfile(str(fname)):
            fname = os.path.abspath(fname)
            print('trying to read file: ' + fname)
            with open(fname, 'r') as f:
                print("file is opened")
                lines = f.readlines()
            self.fname = fname
        else:
            #attempt to use the fname var like an io object
            try:
                lines = fname.readlines()
                fname.close()
            except AttributeError:
                print('file does not exist: ' + str(fname))
                return False
            # in-memory streams have no name; that must not fail the read
            self.fname = getattr(fname, 'name', '')
            self.isFile = True

        if commenter:
            # split() keeps the whole line when the marker is absent, so no
            # character other than the newline is lost
            trimmed = (line.split(commenter, 1)[0].rstrip('\n') for line in lines)
            lines = [text for text in trimmed if text]

        if lines and lines[0].startswith('\ufeff'):
            # the BOM is a prefix of the first line, not a line of its own
            lines[0] = lines[0][1:]
            if not lines[0]:
                lines = lines[1:]

        if not lines:
            print('file could not be read')
            return False

        print("file was read")
        self.data = lines
        return True

    def returnData(self):
        """Return the lines stored by the last successful ``readFile``."""
        return self.data

    def convertFiles(self):
        """Convert the selected file, or every regular file of the directory.

        In directory mode the entries of ``self.d`` are processed in sorted
        order and sub-directories are skipped.  Files that ``readFile``
        rejects are skipped as well.
        """
        if self.dirMode:
            candidates = [os.path.join(self.d, name) for name in sorted(os.listdir(self.d))]
            paths = [path for path in candidates if os.path.isfile(path)]
        else:
            # exactly the file that was asked for, even when it lives
            # inside a directory that holds other posts
            paths = [self.fname]

        for path in paths:
            print(path)
            self.fname = path
            if self.readFile(path):
                print('file contents read')
                self.convert()

    def readHeader(self):
        """Parse the ``---`` delimited front matter of ``self.data``.

        Example of the header that is read::

            ---
            layout: post
            title: "Hosting some more services"
            date: 2014-04-05 12:27:58 -0400
            categories: hosting,software
            ---

        Each ``name: value`` line is stored in ``self.header`` under the
        lower-cased name; only the first colon separates name and value.
        Square brackets are removed from ``categories``.  Lines without a
        colon are ignored.

        Returns:
            Index of the first line after the header, or 0 when the data
            contains no ``---`` line at all (the whole file is body then).
        """
        marker = '---'
        # start clean so values of a previously converted file do not leak
        self.header = {}

        start = None
        for idx, row in enumerate(self.data):
            if row.strip() == marker:
                start = idx + 1
                break

        if start is None:
            print("no header found")
            return 0

        print("header start line: " + str(start))

        nLine = start
        for row in self.data[start:]:
            nLine += 1
            if row.strip() == marker or row == '':
                break

            name, sep, value = row.partition(':')
            if not sep:
                continue
            name = name.strip().lower()
            value = value.strip()
            if name == 'categories':
                value = value.replace('[', '').replace(']', '') #remove brackets
            self.header[name] = value

        return nLine

    def pelicanHeader(self, fewLines):
        """Build the Pelican metadata block from ``self.header``.

        Produces ``Title``, ``Date``, ``Modified``, ``Category`` (always
        ``archive``), ``Tags`` (from ``categories``), ``Slug`` and
        ``Authors`` lines.  ``Date``/``Modified`` and ``Tags`` are left out
        when the post has no date or no categories.  A missing title falls
        back to the file name without extension.  The cleaned-up title is
        written back to ``self.header['title']``.

        Args:
            fewLines: first body lines of the post.  Unused while the
                ``Summary`` line stays disabled; kept so callers need not
                change.

        Returns:
            The metadata block as one string, every line newline-terminated.
        """
        import re  # function-scope import: the file header only imports sys and os

        title = self.header.get('title', '')
        if not title:
            title = os.path.splitext(os.path.basename(self.fname))[0]
        title = title.replace('-', ' ').replace('"', '') #some titles have no spaces
        # slugs can have no spaces/quotes,etc: collapse every run of
        # non-word characters into a single hyphen
        slug = re.sub(r'[^\w]+', '-', title.lower()).strip('-')
        # NOTE(review): presumably restores quotes spelled out as "quot" in
        # generated titles -- confirm; it also rewrites words such as "quote"
        title = title.replace('quot', '"')
        self.header['title'] = title

        date = self.header.get('date', '')[:10]
        tags = self.header.get('categories', '')

        lines = ["Title: " + title]
        if date:
            lines.append("Date: " + date)
            lines.append("Modified: " + date)
        lines.append("Category: " + 'archive')
        if tags:
            lines.append("Tags: " + tags)
        lines.append("Slug: " + slug)
        lines.append("Authors: " + self.authors)

        return '\n'.join(lines) + '\n'

    def changeLineLinksToMarkdown(self, row):
        """Rewrite HTML anchors in *row* as markdown links.

        ``<a href="http://example.com/x.html">some text</a>`` becomes
        ``[some text](http://example.com/x.html)``.  Double-quoted,
        single-quoted and unquoted ``href`` values are understood and other
        attributes of the tag are dropped.  An anchor that is not closed
        within *row* is left untouched.

        Args:
            row: one line of the post.

        Returns:
            The line with every complete anchor converted.
        """
        import re  # function-scope import: the file header only imports sys and os

        if '<a' not in row.lower():
            return row

        pattern = (
            r'<a\s[^>]*?href\s*=\s*'
            r'(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))'
            r'[^>]*>(.*?)</a>'
        )

        def toMarkdown(match):
            link = match.group(1) or match.group(2) or match.group(3) or ''
            return '[' + match.group(4) + '](' + link + ')'

        return re.sub(pattern, toMarkdown, row, flags=re.IGNORECASE | re.DOTALL)

    @staticmethod
    def _imageTitle(text):
        """Return the title part of the text that follows an image path.

        Handles ``[Title [alt]]``, ``'Title' 'alt'`` / ``"Title" "alt"`` and
        plain text; the alt part is discarded.
        """
        if not text:
            return ''
        if text[0] == '[':
            inner = text[1:]
            cut = inner.find('[')
            if cut < 0:
                cut = inner.find(']')
            if cut >= 0:
                inner = inner[:cut]
            return inner.strip()
        if text[0] in '"\'':
            end = text.find(text[0], 1)
            return text[1:end] if end > 0 else text[1:]
        return text

    def correctInternalLinks(self, row):
        """Rewrite ``{% img ... %}`` tags in *row* as markdown images.

        Examples::

            {% img /images/posts/a.png [My Title [my alt]] %}
            {% img /images/posts/a.png 'My Title' 'My alt' %}
            {% img /images/posts/a.png %}

        all become ``![title]({filename}/images/a.png)`` (empty title for the
        last form).  The first token after ``img`` is taken as the path; the
        path is rewritten with ``self.internalLinkPathChange`` and prefixed
        with ``{filename}`` unless it is an ``http(s)://`` or ``//`` URL.
        A tag without closing ``%}`` is left untouched.

        Args:
            row: one line of the post.

        Returns:
            The line with every complete image tag converted.
        """
        import re  # function-scope import: the file header only imports sys and os

        if '{%' not in row:
            return row

        def toMarkdown(match):
            link, _, rest = match.group(1).strip().partition(' ')
            if not link:
                return match.group(0)
            print('orig link section: ' + match.group(0))
            title = self._imageTitle(rest.strip())
            for oldLink, newLink in self.internalLinkPathChange.items():
                link = link.replace(oldLink, newLink)
            if not link.startswith(('http://', 'https://', '//')):
                link = '{filename}' + link
            return '![' + title + '](' + link + ')'

        return re.sub(r'\{%\s*img\s+(.*?)\s*%\}', toMarkdown, row)

    def parseContents(self, row):
        """Apply all body conversions to one line.

        Runs the plain string replacements of ``self.replacements``, then
        the anchor conversion, then the image-tag conversion.

        Args:
            row: one line of the post.

        Returns:
            The converted line.
        """
        for orig, new in self.replacements.items():
            row = row.replace(orig, new)

        #check for <a href type links and change to markdown [text](link)
        row = self.changeLineLinksToMarkdown(row)

        #change jekyll/octopress internal image links to pelican:
        return self.correctInternalLinks(row)

    def convert(self):
        """Write the Pelican version of the post held in ``self.data``.

        The output keeps the input's base name and goes to
        ``self.outputDir`` or, when that is empty, to a ``pelican``
        sub-directory of ``self.d``.  The directory is created if needed.
        """
        print('converting file: ' + self.fname)

        #get jekyll header info:
        nLine = self.readHeader()
        print("post header nline = " + str(nLine))

        baseName = os.path.basename(self.fname)
        if self.outputDir:
            oFileName = os.path.join(self.outputDir, baseName)
        else:
            oFileName = os.path.join(self.d, 'pelican', baseName)

        print("odir: " + os.path.dirname(oFileName))
        ensure_dir(oFileName)

        body = self.data[nLine:]
        # build the header first so a failure cannot leave an empty file
        header = self.pelicanHeader(body[:5])

        print("writing output to file: " + oFileName)
        with open(oFileName, 'w') as of:
            of.write(header + '\n')
            of.writelines(self.parseContents(row) for row in body)
        print("done writing file")


 if __name__ == '__main__':
    # everything after the script name is handed to the converter
    cliArgs = sys.argv[1:]
    if not cliArgs:
        print("no args, nothing to do")
    else:
        # build the converter from the command line and run it
        mydata = myfile(cliArgs)
        mydata.convertFiles()
        print("done")
	#! /usr/bin/env python3
	###
	# original author: Jon Robbins
	####
	# This file converts markdown-formatted octopress files into markdown-formatted pelican files
	# and also "fixes" html links by changing them to md links
	###
	# octopressMD_to_pelicanMD.py
	#syntax is:
	# ./octopressMD_to_pelicanMD.py filename
	# or
	# ./octopressMD_to_pelicanMD.py --dir /some/dirPath/
	###
	# there are some hard-coded items in the __init__ to handle some things (links, output dir, etc)
	###
	# this file is free to edit/modify/redistribute
	# (with no warranty expressed or implied)
	#
	# this file works for me, it may or may not work for you.
	###


	import sys,os

	def ensure_dir(f):
	    """Create the parent directory of file path *f* if it is missing.

	    Only the directory part of *f* is created, never *f* itself.  A bare
	    file name (no directory component) refers to the current directory, so
	    there is nothing to create and the function simply returns.

	    Args:
	        f: path of a file that is about to be written.
	    """
	    d = os.path.dirname(f)
	    print("checking if dir exists: " + d)
	    if not d:
	        # os.makedirs('') raises FileNotFoundError; the current directory
	        # always exists, so there is nothing to do
	        return
	    if not os.path.exists(d):
	        print("creating dir")
	        # exist_ok guards against the directory appearing between the
	        # existence check above and this call
	        os.makedirs(d, exist_ok=True)

	class myfile(object):
	    """Convert Octopress/Jekyll markdown posts into Pelican markdown posts.

	    One instance converts either a single post or every file of a
	    directory, depending on the argument list given to the constructor.
	    For each post the front-matter header is rewritten as Pelican metadata,
	    HTML anchors become markdown links and ``{% img ... %}`` tags become
	    markdown images.

	    Author, output directory and the replacement tables are hard-coded in
	    ``__init__``; edit them before running.
	    """

	    def __init__(self, args):
	        """Set the hard-coded defaults and pick the input from *args*.

	        Only the first argument is examined: ``--dir``/``-d`` selects
	        directory mode and must be followed by the directory path; anything
	        else is taken as the path of the single file to convert.

	        Args:
	            args: command-line arguments without the script name.

	        Raises:
	            ValueError: ``--dir``/``-d`` was given without a directory path.
	        """
	        print("init object")
	        self.authors = 'Your Name'
	        self.data = ''
	        self.fname = ''
	        self.d = ''
	        self.header = {}
	        # True when a whole directory is converted, False for a single file
	        self.dirMode = False
	        self.outputDir = '/path/to/your/pelican/content/' #check for a cli flag for this later

	        #paths to change (example is for embedded images):
	        self.internalLinkPathChange = { '/images/posts/':'/images/' }

	        #strings to replace later:
	        self.replacements = { '<br />':'\n', '<br>':'\n', '<span class="fullpost"></span>':'', 'Category: ##':'tags: #', 'Posted by: jamba':'' }

	        args = list(args)
	        if args:
	            first = args[0]
	            if first == '--dir' or first == '-d':
	                if len(args) < 2:
	                    raise ValueError(first + " requires a directory path")
	                self.d = args[1]
	                self.dirMode = True
	            else:
	                self.fname = first
	                self.d = os.path.dirname(self.fname)

	        print("fname: " + self.fname)
	        print("dir:" + self.d)

	    def readFile(self, fname, commenter=None):
	        """Read *fname* into ``self.data`` as a list of lines.

	        Args:
	            fname: path of an existing file, or an object offering
	                ``readlines()`` and ``close()`` (it is closed after reading).
	            commenter: optional comment marker.  When given, everything from
	                the marker on is cut from each line, the trailing newline is
	                removed and lines that end up empty are dropped.

	        Returns:
	            True when at least one line was stored in ``self.data``,
	            False otherwise.
	        """
	        self.inFile = fname
	        lines = []

	        if os.path.isfile(str(fname)):
	            fname = os.path.abspath(fname)
	            print('trying to read file: ' + fname)
	            with open(fname, 'r') as f:
	                print("file is opened")
	                lines = f.readlines()
	            self.fname = fname
	        else:
	            #attempt to use the fname var like an io object
	            try:
	                lines = fname.readlines()
	                fname.close()
	            except AttributeError:
	                print('file does not exist: ' + str(fname))
	                return False
	            # in-memory streams have no name; that must not fail the read
	            self.fname = getattr(fname, 'name', '')
	            self.isFile = True

	        if commenter:
	            # split() keeps the whole line when the marker is absent, so no
	            # character other than the newline is lost
	            trimmed = (line.split(commenter, 1)[0].rstrip('\n') for line in lines)
	            lines = [text for text in trimmed if text]

	        if lines and lines[0].startswith('\ufeff'):
	            # the BOM is a prefix of the first line, not a line of its own
	            lines[0] = lines[0][1:]
	            if not lines[0]:
	                lines = lines[1:]

	        if not lines:
	            print('file could not be read')
	            return False

	        print("file was read")
	        self.data = lines
	        return True

	    def returnData(self):
	        """Return the lines stored by the last successful ``readFile``."""
	        return self.data

	    def convertFiles(self):
	        """Convert the selected file, or every regular file of the directory.

	        In directory mode the entries of ``self.d`` are processed in sorted
	        order and sub-directories are skipped.  Files that ``readFile``
	        rejects are skipped as well.
	        """
	        if self.dirMode:
	            candidates = [os.path.join(self.d, name) for name in sorted(os.listdir(self.d))]
	            paths = [path for path in candidates if os.path.isfile(path)]
	        else:
	            # exactly the file that was asked for, even when it lives
	            # inside a directory that holds other posts
	            paths = [self.fname]

	        for path in paths:
	            print(path)
	            self.fname = path
	            if self.readFile(path):
	                print('file contents read')
	                self.convert()

	    def readHeader(self):
	        """Parse the ``---`` delimited front matter of ``self.data``.

	        Example of the header that is read::

	            ---
	            layout: post
	            title: "Hosting some more services"
	            date: 2014-04-05 12:27:58 -0400
	            categories: hosting,software
	            ---

	        Each ``name: value`` line is stored in ``self.header`` under the
	        lower-cased name; only the first colon separates name and value.
	        Square brackets are removed from ``categories``.  Lines without a
	        colon are ignored.

	        Returns:
	            Index of the first line after the header, or 0 when the data
	            contains no ``---`` line at all (the whole file is body then).
	        """
	        marker = '---'
	        # start clean so values of a previously converted file do not leak
	        self.header = {}

	        start = None
	        for idx, row in enumerate(self.data):
	            if row.strip() == marker:
	                start = idx + 1
	                break

	        if start is None:
	            print("no header found")
	            return 0

	        print("header start line: " + str(start))

	        nLine = start
	        for row in self.data[start:]:
	            nLine += 1
	            if row.strip() == marker or row == '':
	                break

	            name, sep, value = row.partition(':')
	            if not sep:
	                continue
	            name = name.strip().lower()
	            value = value.strip()
	            if name == 'categories':
	                value = value.replace('[', '').replace(']', '') #remove brackets
	            self.header[name] = value

	        return nLine

	    def pelicanHeader(self, fewLines):
	        """Build the Pelican metadata block from ``self.header``.

	        Produces ``Title``, ``Date``, ``Modified``, ``Category`` (always
	        ``archive``), ``Tags`` (from ``categories``), ``Slug`` and
	        ``Authors`` lines.  ``Date``/``Modified`` and ``Tags`` are left out
	        when the post has no date or no categories.  A missing title falls
	        back to the file name without extension.  The cleaned-up title is
	        written back to ``self.header['title']``.

	        Args:
	            fewLines: first body lines of the post.  Unused while the
	                ``Summary`` line stays disabled; kept so callers need not
	                change.

	        Returns:
	            The metadata block as one string, every line newline-terminated.
	        """
	        import re  # function-scope import: the file header only imports sys and os

	        title = self.header.get('title', '')
	        if not title:
	            title = os.path.splitext(os.path.basename(self.fname))[0]
	        title = title.replace('-', ' ').replace('"', '') #some titles have no spaces
	        # slugs can have no spaces/quotes,etc: collapse every run of
	        # non-word characters into a single hyphen
	        slug = re.sub(r'[^\w]+', '-', title.lower()).strip('-')
	        # NOTE(review): presumably restores quotes spelled out as "quot" in
	        # generated titles -- confirm; it also rewrites words such as "quote"
	        title = title.replace('quot', '"')
	        self.header['title'] = title

	        date = self.header.get('date', '')[:10]
	        tags = self.header.get('categories', '')

	        lines = ["Title: " + title]
	        if date:
	            lines.append("Date: " + date)
	            lines.append("Modified: " + date)
	        lines.append("Category: " + 'archive')
	        if tags:
	            lines.append("Tags: " + tags)
	        lines.append("Slug: " + slug)
	        lines.append("Authors: " + self.authors)

	        return '\n'.join(lines) + '\n'

	    def changeLineLinksToMarkdown(self, row):
	        """Rewrite HTML anchors in *row* as markdown links.

	        ``<a href="http://example.com/x.html">some text</a>`` becomes
	        ``[some text](http://example.com/x.html)``.  Double-quoted,
	        single-quoted and unquoted ``href`` values are understood and other
	        attributes of the tag are dropped.  An anchor that is not closed
	        within *row* is left untouched.

	        Args:
	            row: one line of the post.

	        Returns:
	            The line with every complete anchor converted.
	        """
	        import re  # function-scope import: the file header only imports sys and os

	        if '<a' not in row.lower():
	            return row

	        pattern = (
	            r'<a\s[^>]*?href\s*=\s*'
	            r'(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))'
	            r'[^>]*>(.*?)</a>'
	        )

	        def toMarkdown(match):
	            link = match.group(1) or match.group(2) or match.group(3) or ''
	            return '[' + match.group(4) + '](' + link + ')'

	        return re.sub(pattern, toMarkdown, row, flags=re.IGNORECASE | re.DOTALL)

	    @staticmethod
	    def _imageTitle(text):
	        """Return the title part of the text that follows an image path.

	        Handles ``[Title [alt]]``, ``'Title' 'alt'`` / ``"Title" "alt"`` and
	        plain text; the alt part is discarded.
	        """
	        if not text:
	            return ''
	        if text[0] == '[':
	            inner = text[1:]
	            cut = inner.find('[')
	            if cut < 0:
	                cut = inner.find(']')
	            if cut >= 0:
	                inner = inner[:cut]
	            return inner.strip()
	        if text[0] in '"\'':
	            end = text.find(text[0], 1)
	            return text[1:end] if end > 0 else text[1:]
	        return text

	    def correctInternalLinks(self, row):
	        """Rewrite ``{% img ... %}`` tags in *row* as markdown images.

	        Examples::

	            {% img /images/posts/a.png [My Title [my alt]] %}
	            {% img /images/posts/a.png 'My Title' 'My alt' %}
	            {% img /images/posts/a.png %}

	        all become ``![title]({filename}/images/a.png)`` (empty title for the
	        last form).  The first token after ``img`` is taken as the path; the
	        path is rewritten with ``self.internalLinkPathChange`` and prefixed
	        with ``{filename}`` unless it is an ``http(s)://`` or ``//`` URL.
	        A tag without closing ``%}`` is left untouched.

	        Args:
	            row: one line of the post.

	        Returns:
	            The line with every complete image tag converted.
	        """
	        import re  # function-scope import: the file header only imports sys and os

	        if '{%' not in row:
	            return row

	        def toMarkdown(match):
	            link, _, rest = match.group(1).strip().partition(' ')
	            if not link:
	                return match.group(0)
	            print('orig link section: ' + match.group(0))
	            title = self._imageTitle(rest.strip())
	            for oldLink, newLink in self.internalLinkPathChange.items():
	                link = link.replace(oldLink, newLink)
	            if not link.startswith(('http://', 'https://', '//')):
	                link = '{filename}' + link
	            return '![' + title + '](' + link + ')'

	        return re.sub(r'\{%\s*img\s+(.*?)\s*%\}', toMarkdown, row)

	    def parseContents(self, row):
	        """Apply all body conversions to one line.

	        Runs the plain string replacements of ``self.replacements``, then
	        the anchor conversion, then the image-tag conversion.

	        Args:
	            row: one line of the post.

	        Returns:
	            The converted line.
	        """
	        for orig, new in self.replacements.items():
	            row = row.replace(orig, new)

	        #check for <a href type links and change to markdown [text](link)
	        row = self.changeLineLinksToMarkdown(row)

	        #change jekyll/octopress internal image links to pelican:
	        return self.correctInternalLinks(row)

	    def convert(self):
	        """Write the Pelican version of the post held in ``self.data``.

	        The output keeps the input's base name and goes to
	        ``self.outputDir`` or, when that is empty, to a ``pelican``
	        sub-directory of ``self.d``.  The directory is created if needed.
	        """
	        print('converting file: ' + self.fname)

	        #get jekyll header info:
	        nLine = self.readHeader()
	        print("post header nline = " + str(nLine))

	        baseName = os.path.basename(self.fname)
	        if self.outputDir:
	            oFileName = os.path.join(self.outputDir, baseName)
	        else:
	            oFileName = os.path.join(self.d, 'pelican', baseName)

	        print("odir: " + os.path.dirname(oFileName))
	        ensure_dir(oFileName)

	        body = self.data[nLine:]
	        # build the header first so a failure cannot leave an empty file
	        header = self.pelicanHeader(body[:5])

	        print("writing output to file: " + oFileName)
	        with open(oFileName, 'w') as of:
	            of.write(header + '\n')
	            of.writelines(self.parseContents(row) for row in body)
	        print("done writing file")


	if __name__ == '__main__':
	    # everything after the script name is handed to the converter
	    cliArgs = sys.argv[1:]
	    if not cliArgs:
	        print("no args, nothing to do")
	    else:
	        # build the converter from the command line and run it
	        mydata = myfile(cliArgs)
	        mydata.convertFiles()
	        print("done")
No results found