- Start the server locally.
- Log in to your Penzu account.
- Run the script in Chrome -> DevTools -> Sources -> Snippets.

The script fetches all entries using your current session and sends them to the local server, which saves everything into the `export` folder.
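For reference, each entry ends up as one POST to the local server with the form fields sketched below; the values are made up for illustration, not from a real journal:

```python
# Illustration of one record as the browser snippet sends it (made-up values).
payload = {
    'date':   '2014-01-02T03:04:05.000Z',  # ISO date taken from the entry page
    'title':  'My first entry',
    'body':   '<p>Hello world</p>',        # HTML body, converted to plain text
    'ftype':  'html',
    'source': 'penzu',
}
# the server below would save this as export/2014_01_02_My_first_entry.txt
```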
The Flask server:

```python
# Flask server (Python 2) that receives entries from the browser snippet
# and writes them as text files into the export folder.
from flask import Flask, make_response, request
import os
import re
import datetime
import HTMLParser

BASE_FOLDER = os.path.join(os.path.dirname(__file__), 'export')
if not os.path.isdir(BASE_FOLDER):
    os.makedirs(BASE_FOLDER)  # the save handler assumes this folder exists

app = Flask(__name__)


def isostr_to_date(s):
    # '2014-01-02T03:04:05.000Z' -> datetime(2014, 1, 2, 3, 4, 5, 0)
    return datetime.datetime(*map(int, re.split(r'[^\d]', s)[:-1]))


@app.route("/save", methods=['GET', 'POST', 'OPTIONS'])
def save():
    response = make_response('success')
    if request.method == 'POST':
        data = request.form
        ftype = data.get('ftype') or 'html'
        title = data.get('title')
        sdate = data.get('date')
        if sdate:
            d = isostr_to_date(sdate)
            sdate = d.strftime('%Y-%m-%d')
            title = '%s-%s' % (sdate, title)
        body = data.get('body')
        if ftype == 'html':
            body = convert_to_html(body)
            ftype = 'txt'
        # replace whitespace and punctuation so the title is a safe file name
        title = re.sub(r"[\s,\-./?']+", '_', title)
        name = '%s.%s' % (title, ftype)
        path = os.path.join(BASE_FOLDER, name)
        with open(path, 'wb') as f:
            f.write(data.get('title').encode('utf8') + '\n\n')
            f.write(body.encode('utf8'))
    elif request.method == 'OPTIONS':
        # do nothing, just return the Access-Control headers below
        pass
    else:
        return 'use POST method to save files'
    response.headers['Access-Control-Allow-Origin'] = '*'  # or 'https://penzu.com'
    response.headers['Access-Control-Allow-Methods'] = 'POST'
    response.headers['Access-Control-Allow-Headers'] = "Origin, X-Requested-With, Content-Type, Accept"
    return response


def convert_to_html(s):
    # despite the name, this strips HTML tags and returns plain text
    s = re.sub(r'<br\s*/?>', '\n', s)
    for tag in ['p', 'div']:
        s = re.sub(r'</?%s[^>]*>' % tag, '\n', s)
    return update_html_unicode(s)


def update_html_unicode(s):
    # replace HTML entities like &#8217; with the characters they encode
    list_of_html = re.findall(r'&.+?;', s)  # find all HTML entities in the page
    parser = HTMLParser.HTMLParser()
    for e in list_of_html:
        # for an unknown reason parser.unescape didn't work for some characters
        # on my machine, although it should; maybe a platform or version issue
        char_code = e[2:-1]
        if char_code[0] == 'x':
            raise NotImplementedError('not implemented for hex numbers')
        try:
            unescaped = unichr(int(char_code, 10))
        except ValueError:
            unescaped = parser.unescape(e)
        s = s.replace(e, unescaped)
    return s


if __name__ == "__main__":
    app.run(debug=True)
```
Dependencies:

```
BeautifulSoup==3.2.1
Flask==0.10.1
Jinja2==2.7.2
MarkupSafe==0.18
Werkzeug==0.9.4
argparse==1.2.1
itsdangerous==0.23
wsgiref==0.1.2
```
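With the dependencies installed and the server running, the `/save` endpoint can be smoke-tested from Python before touching the browser at all. This is a sketch assuming the `requests` package is available (it is not in the list above):

```python
import requests

# POST one fake entry; on success the server replies 'success' and a file
# named export/2014_01_02_Test_entry.txt appears next to the Flask script.
resp = requests.post('http://localhost:5000/save', data={
    'ftype': 'html',
    'title': 'Test entry',
    'date': '2014-01-02T03:04:05.000Z',
    'body': '<p>Hello &amp; goodbye</p>',
})
print(resp.text)
```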
The browser snippet:

```js
(function () {
  var PENZU = {
    link_template: '/pad/load_entry/',
    links: [],

    // walk all pagination pages, collect entry links, then process them
    export: function () {
      var pages = $('a.page, span.current.lmarg').map(function (n, p) {
        return parseInt($(p).text(), 10);
      });
      pages = $.makeArray(pages);
      var self = this;
      _fetchLinks();

      function _fetchLinks(err) {
        if (pages.length) {
          self.linksForPage(pages.pop(), _fetchLinks);
        } else {
          self.processLinks(self.links);
        }
      }
    },

    // fetch the list of entry links for one pagination page
    linksForPage: function (pageNumber, cb) {
      console.log('loading links for page %d', pageNumber);
      var self = this;
      $.ajax({
        url: '/entries/entries',
        data: { page: pageNumber },
        type: 'GET',
        dataType: 'json',
        success: function (res) {
          var links = res.entries.map(function (e) {
            return self.link_template + e.id;
          });
          self.links = $.merge(self.links, links);
          cb(null);
        },
        error: function () {
          console.error('linksForPage error');
          cb('error in linksForPage ' + pageNumber);
        }
      });
    },

    // download each entry and send it to the local server, one at a time
    processLinks: function (links) {
      console.log('found %d links', links.length);
      _run(links.pop());

      function _run(link) {
        console.log('processing link %s, number of links left: %d', link, links.length);
        extractArticle(link, function (er, data) {
          saveToFile(data, function (err) {
            if (err) {
              console.error(err);
            } else if (links.length) {
              _run(links.pop());
            }
          });
        });
      }
    }
  };

  PENZU.export();

  // pull the date, title and body out of a single entry page
  function extractArticle(link, cb) {
    $.get(link, function (res) {
      var el = $(res);
      var d = new Date(el.find('#entry_date_url').text());
      var data = {
        date: d.toISOString(),
        title: el.find('#entry_title').val(),
        body: el.find('#entry_body').text(),
        ftype: 'html',
        source: 'penzu'
      };
      cb(null, data);
    });
  }

  // POST one entry to the local Flask server
  function saveToFile(data, cb) {
    $.ajax({
      crossDomain: true,
      url: 'http://localhost:5000/save',
      type: 'POST',
      data: data,
      success: function () {
        console.log('saved');
        cb(null);
      },
      error: function (er, x) {
        console.warn(x);
        cb('cannot save data');
      }
    });
  }
})();
```
I couldn't extract the title, maybe because it is a textarea. Instead I had to take the previous entry's title from the 'previous' link at the bottom of the page, and then shift the title column in post-processing.
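A minimal sketch of that post-processing shift, assuming the exported rows are ordered oldest first and each row carries the previous entry's title (all field names here are hypothetical):

```python
# Hypothetical realignment: row i holds the title of entry i-1, so the real
# title of entry i sits on row i+1; the newest entry's title stays unknown.
rows = [
    {'date': '2014-01-01', 'prev_title': None},   # first entry has no predecessor
    {'date': '2014-01-02', 'prev_title': 'First'},
    {'date': '2014-01-03', 'prev_title': 'Second'},
]
shifted = [r['prev_title'] for r in rows[1:]] + [None]
for row, title in zip(rows, shifted):
    row['title'] = title  # 2014-01-01 -> 'First', 2014-01-02 -> 'Second', ...
```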
To anyone who comes here to export data from Penzu: you can also use the Data Miner Chrome extension for that.
https://chrome.google.com/webstore/detail/data-scraper-easy-web-scr/nndknepjnldbdbepjfgmncbggmopgden