- Start the server locally.
- Log in to your Penzu account.
- Run the script in Chrome -> DevTools -> Sources -> Snippets.

The script fetches all entries using your current session and sends them to the local server, which saves everything into the `export` folder.
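For reference, each entry ends up as one POST to the local server with the form fields sketched below; the values are made up for illustration, not from a real journal:

```python
# Illustration of one record as the browser snippet sends it (made-up values).
payload = {
    'date':   '2014-01-02T03:04:05.000Z',  # ISO date taken from the entry page
    'title':  'My first entry',
    'body':   '<p>Hello world</p>',        # HTML body, converted to plain text
    'ftype':  'html',
    'source': 'penzu',
}
# the server below would save this as export/2014_01_02_My_first_entry.txt
```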
The Flask server:

```python
# Flask server (Python 2) that receives entries from the browser snippet
# and writes them as text files into the export folder.
from flask import Flask, make_response, request
import os
import re
import datetime
import HTMLParser

BASE_FOLDER = os.path.join(os.path.dirname(__file__), 'export')
if not os.path.isdir(BASE_FOLDER):
    os.makedirs(BASE_FOLDER)  # the save handler assumes this folder exists

app = Flask(__name__)


def isostr_to_date(s):
    # '2014-01-02T03:04:05.000Z' -> datetime(2014, 1, 2, 3, 4, 5, 0)
    return datetime.datetime(*map(int, re.split(r'[^\d]', s)[:-1]))


@app.route("/save", methods=['GET', 'POST', 'OPTIONS'])
def save():
    response = make_response('success')
    if request.method == 'POST':
        data = request.form
        ftype = data.get('ftype') or 'html'
        title = data.get('title')
        sdate = data.get('date')
        if sdate:
            d = isostr_to_date(sdate)
            sdate = d.strftime('%Y-%m-%d')
            title = '%s-%s' % (sdate, title)
        body = data.get('body')
        if ftype == 'html':
            body = convert_to_html(body)
            ftype = 'txt'
        # replace whitespace and punctuation so the title is a safe file name
        title = re.sub(r"[\s,\-./?']+", '_', title)
        name = '%s.%s' % (title, ftype)
        path = os.path.join(BASE_FOLDER, name)
        with open(path, 'wb') as f:
            f.write(data.get('title').encode('utf8') + '\n\n')
            f.write(body.encode('utf8'))
    elif request.method == 'OPTIONS':
        # do nothing, just return the Access-Control headers below
        pass
    else:
        return 'use POST method to save files'
    response.headers['Access-Control-Allow-Origin'] = '*'  # or 'https://penzu.com'
    response.headers['Access-Control-Allow-Methods'] = 'POST'
    response.headers['Access-Control-Allow-Headers'] = "Origin, X-Requested-With, Content-Type, Accept"
    return response


def convert_to_html(s):
    # despite the name, this strips HTML tags and returns plain text
    s = re.sub(r'<br\s*/?>', '\n', s)
    for tag in ['p', 'div']:
        s = re.sub(r'</?%s[^>]*>' % tag, '\n', s)
    return update_html_unicode(s)


def update_html_unicode(s):
    # replace HTML entities like &#8217; with the characters they encode
    list_of_html = re.findall(r'&.+?;', s)  # find all HTML entities in the page
    parser = HTMLParser.HTMLParser()
    for e in list_of_html:
        # for an unknown reason parser.unescape didn't work for some characters
        # on my machine, although it should; maybe a platform or version issue
        char_code = e[2:-1]
        if char_code[0] == 'x':
            raise NotImplementedError('not implemented for hex numbers')
        try:
            unescaped = unichr(int(char_code, 10))
        except ValueError:
            unescaped = parser.unescape(e)
        s = s.replace(e, unescaped)
    return s


if __name__ == "__main__":
    app.run(debug=True)
```
Dependencies:

```
BeautifulSoup==3.2.1
Flask==0.10.1
Jinja2==2.7.2
MarkupSafe==0.18
Werkzeug==0.9.4
argparse==1.2.1
itsdangerous==0.23
wsgiref==0.1.2
```
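With the dependencies installed and the server running, the `/save` endpoint can be smoke-tested from Python before touching the browser at all. This is a sketch assuming the `requests` package is available (it is not in the list above):

```python
import requests

# POST one fake entry; on success the server replies 'success' and a file
# named export/2014_01_02_Test_entry.txt appears next to the Flask script.
resp = requests.post('http://localhost:5000/save', data={
    'ftype': 'html',
    'title': 'Test entry',
    'date': '2014-01-02T03:04:05.000Z',
    'body': '<p>Hello &amp; goodbye</p>',
})
print(resp.text)
```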
The browser snippet:

```js
(function () {
  var PENZU = {
    link_template: '/pad/load_entry/',
    links: [],

    // walk all pagination pages, collect entry links, then process them
    export: function () {
      var pages = $('a.page, span.current.lmarg').map(function (n, p) {
        return parseInt($(p).text(), 10);
      });
      pages = $.makeArray(pages);
      var self = this;
      _fetchLinks();

      function _fetchLinks(err) {
        if (pages.length) {
          self.linksForPage(pages.pop(), _fetchLinks);
        } else {
          self.processLinks(self.links);
        }
      }
    },

    // fetch the list of entry links for one pagination page
    linksForPage: function (pageNumber, cb) {
      console.log('loading links for page %d', pageNumber);
      var self = this;
      $.ajax({
        url: '/entries/entries',
        data: { page: pageNumber },
        type: 'GET',
        dataType: 'json',
        success: function (res) {
          var links = res.entries.map(function (e) {
            return self.link_template + e.id;
          });
          self.links = $.merge(self.links, links);
          cb(null);
        },
        error: function () {
          console.error('linksForPage error');
          cb('error in linksForPage ' + pageNumber);
        }
      });
    },

    // download each entry and send it to the local server, one at a time
    processLinks: function (links) {
      console.log('found %d links', links.length);
      _run(links.pop());

      function _run(link) {
        console.log('processing link %s, number of links left: %d', link, links.length);
        extractArticle(link, function (er, data) {
          saveToFile(data, function (err) {
            if (err) {
              console.error(err);
            } else if (links.length) {
              _run(links.pop());
            }
          });
        });
      }
    }
  };

  PENZU.export();

  // pull the date, title and body out of a single entry page
  function extractArticle(link, cb) {
    $.get(link, function (res) {
      var el = $(res);
      var d = new Date(el.find('#entry_date_url').text());
      var data = {
        date: d.toISOString(),
        title: el.find('#entry_title').val(),
        body: el.find('#entry_body').text(),
        ftype: 'html',
        source: 'penzu'
      };
      cb(null, data);
    });
  }

  // POST one entry to the local Flask server
  function saveToFile(data, cb) {
    $.ajax({
      crossDomain: true,
      url: 'http://localhost:5000/save',
      type: 'POST',
      data: data,
      success: function () {
        console.log('saved');
        cb(null);
      },
      error: function (er, x) {
        console.warn(x);
        cb('cannot save data');
      }
    });
  }
})();
```
I couldn't extract the title, maybe because it is a textarea. Instead I had to take the previous entry's title from the 'previous' link at the bottom of the page, and then shift the title column in post-processing.
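A minimal sketch of that post-processing shift, assuming the exported rows are ordered oldest first and each row carries the previous entry's title (all field names here are hypothetical):

```python
# Hypothetical realignment: row i holds the title of entry i-1, so the real
# title of entry i sits on row i+1; the newest entry's title stays unknown.
rows = [
    {'date': '2014-01-01', 'prev_title': None},   # first entry has no predecessor
    {'date': '2014-01-02', 'prev_title': 'First'},
    {'date': '2014-01-03', 'prev_title': 'Second'},
]
shifted = [r['prev_title'] for r in rows[1:]] + [None]
for row, title in zip(rows, shifted):
    row['title'] = title  # 2014-01-01 -> 'First', 2014-01-02 -> 'Second', ...
```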
To anyone who comes here to export data from Penzu: you can also use the Data Miner Chrome extension for that.
https://chrome.google.com/webstore/detail/data-scraper-easy-web-scr/nndknepjnldbdbepjfgmncbggmopgden