Created
August 7, 2014 12:40
-
-
Save jfrobbins/b586b3a29fed71cb212b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env python3 | |
| ### | |
| # original author: Jon Robbins | |
| #### | |
| # This file converts markdown-formatted octopress files into markdown-formatted pelican files | |
| # and also "fixes" html links by changing them to md links | |
| ### | |
| # octopress2pelican.py | |
| #syntax is: | |
| # ./octopress2pelican.py filename | |
| # or | |
| # ./octopress2pelican.py --dir /some/dirPath/ | |
| ### | |
| # Several settings are hard-coded in myfile.__init__ (author name, output dir, image path rewrites, text replacements); edit them before running. | |
| ### | |
| # this file is free to edit/modify/redistribute | |
| # (with no warranty expressed or implied) | |
| # | |
| # this file works for me, it may or may not work for you. | |
| ### | |
| import sys,os | |
def ensure_dir(f):
    """Make sure the directory that will contain the file path *f* exists.

    Only the parent directory of *f* is created (including any missing
    intermediate directories); *f* itself is never touched.

    Args:
        f: path of a file that is about to be written.

    Returns:
        None.
    """
    d = os.path.dirname(f)
    print("checking if dir exists: " + d)
    if not d:
        # A bare filename has no directory component: the file lives in the
        # current working directory, and os.makedirs('') would raise.
        return
    if not os.path.exists(d):
        print("creating dir")
        # exist_ok avoids a crash if the directory appears between the
        # existence check above and this call.
        os.makedirs(d, exist_ok=True)
class myfile(object):
    """Convert Octopress/Jekyll markdown posts into Pelican markdown posts.

    The converter reads a post, parses its ``---`` delimited front matter,
    writes a Pelican metadata header and then copies the body while applying
    text replacements, turning ``<a href=...>`` links into markdown links and
    ``{% img ... %}`` tags into markdown images.

    Several settings (author, output directory, path rewrites, replacement
    strings) are hard-coded in ``__init__`` and are meant to be edited.
    """

    def __init__(self, args):
        """Set defaults and interpret the command-line style *args*.

        Only the first argument is examined:

        * ``--dir``/``-d`` followed by a path selects directory mode,
        * anything else is taken as the path of a single file.

        Args:
            args: sequence of argument strings (typically ``sys.argv[1:]``).

        Raises:
            IndexError: ``--dir``/``-d`` was given without a directory path.
        """
        print("init object")
        self.authors = 'Your Name'
        self.data = ''
        self.fname = ''
        self.d = ''
        self.header = {}
        self.inFile = None
        self.isFile = False
        #True when a single file (rather than a directory) was requested
        self.singleFile = False
        self.outputDir = '/path/to/your/pelican/content/'  #check for a cli flag for this later

        #paths to change (example is for embedded images):
        self.internalLinkPathChange = {'/images/posts/': '/images/'}
        #strings to replace later:
        self.replacements = {
            '<br />': '\n',
            '<br>': '\n',
            '<span class="fullpost"></span>': '',
            'Category: ##': 'tags: #',
            'Posted by: jamba': '',
        }

        if args:
            first = args[0]
            if first in ('--dir', '-d'):
                if len(args) < 2:
                    #same exception type as the unchecked lookup, clearer message
                    raise IndexError(first + ' requires a directory path')
                self.d = args[1]
            else:
                self.fname = first
                self.d = os.path.dirname(first)
                self.singleFile = True

        print("fname: " + self.fname)
        print("dir:" + self.d)

    def readFile(self, fname, commenter=None):
        """Read *fname* into ``self.data`` as a list of lines.

        Args:
            fname: path of an existing file, or an object with a
                ``readlines()`` method (it is closed after reading).
            commenter: optional comment marker. When given, everything from
                the marker to the end of each line is dropped, trailing
                newlines are removed and empty lines are discarded.

        Returns:
            True when at least one line was read, False otherwise.
        """
        self.inFile = fname

        if os.path.isfile(str(fname)):
            fname = os.path.abspath(fname)
            print('trying to read file: ' + fname)
            with open(fname, 'r') as f:
                print("file is opened")
                lines = f.readlines()
            self.fname = fname
        elif hasattr(fname, 'readlines'):
            #treat fname as an already opened file-like object
            lines = fname.readlines()
            if hasattr(fname, 'close'):
                fname.close()
            #in-memory streams (io.StringIO) have no name; keep the old one
            name = getattr(fname, 'name', None)
            if name is not None:
                self.fname = name
            self.isFile = True
        else:
            print('file does not exist: ' + str(fname))
            return False

        if lines and lines[0].startswith('\ufeff'):
            #strip unicode BOM (it is a prefix of the first line, not a line)
            lines[0] = lines[0][1:]
            if not lines[0]:
                del lines[0]

        if commenter:
            #trim comments and remove blank lines; split() leaves lines
            #without a marker intact instead of chopping their last char
            trimmed = (line.split(commenter, 1)[0].rstrip('\n') for line in lines)
            lines = [line for line in trimmed if line]

        if not lines:
            print('file could not be read')
            return False

        print("file was read")
        self.data = lines
        return True

    def returnData(self):
        """Return the lines loaded by the last successful ``readFile``."""
        return self.data

    def convertFiles(self):
        """Convert the single requested file, or every file in ``self.d``."""
        #instances built without __init__ fall back to the "no directory
        #means single file" rule
        if getattr(self, 'singleFile', not self.d):
            targets = [self.fname]
        else:
            #regular files of the directory, in a deterministic order
            candidates = (os.path.join(self.d, name)
                          for name in sorted(os.listdir(self.d)))
            targets = [path for path in candidates if os.path.isfile(path)]

        for path in targets:
            print(path)
            self.fname = path
            #use this instance, not a module-level global
            if self.readFile(path):
                print('file contents read')
                self.convert()

    def readHeader(self):
        """Parse the Octopress/Jekyll front matter into ``self.header``.

        Example front matter::

            ---
            layout: post
            title: "Hosting some more services"
            date: 2014-04-05 12:27:58 -0400
            comments: true
            categories: hosting,software,owncloud
            ---

        Keys are lower-cased; values are split from the key at the FIRST
        colon only, so colons inside titles and timestamps survive. Square
        brackets are removed from the ``categories`` value. Lines without a
        ``key:`` part are ignored.

        Returns:
            Index of the first line after the closing ``---`` (0 when the
            data contains no ``---`` line at all, i.e. no front matter).
        """
        #forget whatever the previously converted file declared
        self.header = {}

        start = None
        for idx, row in enumerate(self.data):
            if row.strip() == '---':
                start = idx
                break
        if start is None:
            print("no header found")
            return 0

        nLine = start + 1
        print("header start line: " + str(nLine))

        for row in self.data[nLine:]:
            print(row)
            nLine += 1
            text = row.strip()
            if row == '' or text == '---':
                #end of header
                break
            key, sep, value = text.partition(':')
            key = key.strip().lower()
            if not sep or not key:
                #blank line or something that is not "key: value"
                continue
            value = value.strip()
            if key == 'categories':
                value = value.replace('[', '').replace(']', '')  #remove brackets
            print([key, value])
            self.header[key] = value

        return nLine

    @staticmethod
    def _slugify(text):
        """Return a lower-case, hyphen separated slug built from *text*."""
        kept = ''.join(c for c in text.lower() if c.isalnum() or c in ' -_')
        return '-'.join(kept.replace('-', ' ').split())

    def pelicanHeader(self, fewLines):
        """Build the Pelican metadata header from ``self.header``.

        Produced fields::

            Title:    front-matter title (hyphens -> spaces, '"' removed)
            Date:     first 10 characters of the front-matter date
            Modified: same as Date
            Category: always 'archive'
            Tags:     front-matter categories
            Slug:     derived from the title
            Authors:  self.authors

        ``Date``/``Modified`` and ``Tags`` are left out when the front matter
        does not provide them. ``self.header['title']`` is updated with the
        cleaned title.

        Args:
            fewLines: first body lines of the post. Accepted for backward
                compatibility; no Summary field is emitted, so it is unused.

        Returns:
            The header text, one ``Name: value`` pair per line, ending with
            a newline.
        """
        header = self.header

        title = header.get('title', '')
        if not title:
            #no title in the front matter: fall back to the file name
            title = os.path.splitext(
                os.path.basename(str(getattr(self, 'fname', '') or '')))[0]
        title = title.replace('-', ' ').replace('"', '')  #some titles have no spaces
        slug = self._slugify(title)  #slugs can have no spaces/quotes,etc
        #NOTE(review): every 'quot' becomes a double quote, which also hits
        #words such as "quote" -- presumably the source posts used 'quot' as
        #a quote placeholder; confirm before changing.
        title = title.replace('quot', '"')
        header['title'] = title

        date = header.get('date', '')[:10]
        tags = header.get('categories', '')

        lines = ["Title: " + title]
        if date:
            lines.append("Date: " + date)
            lines.append("Modified: " + date)
        lines.append("Category: " + 'archive')
        if tags:
            lines.append("Tags: " + tags)
        lines.append("Slug: " + slug)
        lines.append("Authors: " + self.authors)
        return '\n'.join(lines) + '\n'

    def changeLineLinksToMarkdown(self, row):
        """Rewrite HTML anchors in *row* as markdown links.

        ``<a href="http://host/page.html">some text</a>`` becomes
        ``[some text](http://host/page.html)``. The href may be double
        quoted, single quoted or unquoted, and further attributes after it
        are dropped. An anchor whose ``</a>`` is not on this row is left
        untouched.

        Args:
            row: one line of text.

        Returns:
            The line with every complete anchor converted.
        """
        prefx = '<a href='
        postfx = '</a>'

        pos = row.find(prefx)
        while pos >= 0:
            valueStart = pos + len(prefx)
            quote = row[valueStart:valueStart + 1]
            if quote in ('"', "'"):
                valueEnd = row.find(quote, valueStart + 1)
                link = row[valueStart + 1:valueEnd]
                tagEnd = row.find('>', valueEnd) if valueEnd >= 0 else -1
            else:
                #unquoted value: it ends at the first whitespace or '>'
                tagEnd = row.find('>', valueStart)
                parts = row[valueStart:tagEnd].split() if tagEnd >= 0 else []
                link = parts[0] if parts else ''

            close = row.find(postfx, tagEnd + 1) if tagEnd >= 0 else -1
            if close < 0:
                #incomplete anchor (probably continues on the next line)
                pos = row.find(prefx, valueStart)
                continue

            linkText = row[tagEnd + 1:close]
            mdText = '[' + linkText + '](' + link + ')'
            row = row[:pos] + mdText + row[close + len(postfx):]
            #continue after the text just inserted
            pos = row.find(prefx, pos + len(mdText))
        return row

    @staticmethod
    def _splitToken(text):
        """Split *text* into its first whitespace-separated token and the rest."""
        parts = text.split(None, 1)
        if not parts:
            return '', ''
        if len(parts) == 1:
            return parts[0], ''
        return parts[0], parts[1].strip()

    def _imgTagToMarkdown(self, body):
        """Turn the inside of an ``{% img ... %}`` tag into a markdown image.

        *body* is the text between ``{% img `` and ``%}``. Tokens before the
        image path (class names) and up to two numeric tokens after it
        (width/height) are skipped. Returns None when no path is found.
        """
        rest = body.strip()

        #the path is the first token containing a '/'
        link = ''
        while rest:
            token, rest = self._splitToken(rest)
            if '/' in token:
                link = token
                break
        if not link:
            return None
        print('link: ' + link)

        #optional width and height
        for _ in range(2):
            token, remainder = self._splitToken(rest)
            if not token.isdigit():
                break
            rest = remainder

        #title text; a trailing alt text is redundant and dropped
        if rest[:1] in ('"', "'"):
            #'title' 'alt'
            end = rest.find(rest[0], 1)
            linkText = rest[1:end] if end > 0 else rest[1:]
        elif rest.startswith('['):
            #[title [alt]]
            linkText = rest[1:].split('[', 1)[0].rstrip(']')
        else:
            linkText = rest
        linkText = linkText.strip()
        print('linktext: ' + linkText)

        for oldLink, newLink in self.internalLinkPathChange.items():
            link = link.replace(oldLink, newLink)
        if '://' not in link:
            #site-internal file: let pelican resolve it
            #NOTE(review): newer Pelican releases may prefer {static} for
            #static files -- confirm against the Pelican version in use.
            link = '{filename}' + link
        return '![' + linkText + '](' + link + ')'

    def correctInternalLinks(self, row):
        """Rewrite Octopress image tags in *row* as Pelican markdown images.

        Examples (with the default ``internalLinkPathChange``)::

            {% img /images/posts/irssi_setup.png [My Irssi Setup [my irssi setup]] %}
              -> ![My Irssi Setup]({filename}/images/irssi_setup.png)
            {% img /images/posts/rasPiPump.jpg 'Raspberry Pi Pump' 'My Pump' %}
              -> ![Raspberry Pi Pump]({filename}/images/rasPiPump.jpg)

        Tags without a title produce an empty alt text. A tag that is not
        closed on this row, or that contains no path, is left untouched.

        Args:
            row: one line of text.

        Returns:
            The line with every complete image tag converted.
        """
        prefx = '{% img '
        postfx = '%}'

        if prefx not in row:
            return row
        print('correcting internal links')

        pos = row.find(prefx)
        while pos >= 0:
            end = row.find(postfx, pos + len(prefx))
            if end < 0:
                #unterminated tag: nothing more to convert on this row
                break
            tagStop = end + len(postfx)
            print('orig link section: ' + row[pos:tagStop])

            mdText = self._imgTagToMarkdown(row[pos + len(prefx):end])
            if mdText is None:
                pos = row.find(prefx, tagStop)
                continue

            row = row[:pos] + mdText + row[tagStop:]
            pos = row.find(prefx, pos + len(mdText))
        return row

    def parseContents(self, row):
        """Apply all body conversions to one line and return the result.

        Order: plain string replacements from ``self.replacements``, then
        HTML links -> markdown links, then Octopress image tags -> markdown
        images.
        """
        for orig, new in self.replacements.items():
            row = row.replace(orig, new)

        #check for <a href type links and change to markdown [text](link)
        row = self.changeLineLinksToMarkdown(row)

        #change jekyll/octopress internal image links to pelican:
        row = self.correctInternalLinks(row)
        return row

    def convert(self):
        """Convert the post held in ``self.data`` and write the Pelican file.

        The output keeps the input's base name and is written to
        ``self.outputDir`` or, when that is empty, to a ``pelican``
        sub-directory of ``self.d``.
        """
        print('converting file: ' + self.fname)

        #get jekyll header info:
        nLine = self.readHeader()
        print("post header nline = " + str(nLine))

        baseName = os.path.basename(self.fname)
        if self.outputDir:
            oFileName = os.path.join(self.outputDir, baseName)
        else:
            oFileName = os.path.join(self.d, 'pelican', baseName)
        print("odir: " + os.path.dirname(oFileName))
        ensure_dir(oFileName)

        #build everything first so a conversion error cannot leave a
        #half-written output file behind
        body = self.data[nLine:]
        hdr = self.pelicanHeader(body[:5])
        converted = [self.parseContents(row) for row in body]

        print("writing output to file: " + oFileName)
        with open(oFileName, 'w') as of:
            of.write(hdr + '\n')
            of.writelines(converted)
        print("done writing file")
if __name__ == '__main__':
    # Script entry point: everything after the program name is handed to the
    # converter; without any argument there is nothing to convert.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("no args, nothing to do")
    else:
        # Keep the module-level name `mydata`: code elsewhere in this file
        # may look the converter up under that global name.
        mydata = myfile(cli_args)
        mydata.convertFiles()
        print("done")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment