# calibre habrahabr recipe
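#
# This file bundles a lightly modified copy of calibre's RecursiveFetcher
# (from calibre.web.fetch.simple) together with a recipe class for
# habrahabr.ru. The fetcher copy adds debugging prints and a gzip workaround
# for responses that arrive compressed; the recipe wires the copied fetcher
# into _fetch_article instead of calibre's builtin one.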
import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
from urllib import url2pathname, quote
from httplib import responses
from base64 import b64decode

from calibre import browser, relpath, unicode_path, fit_image
from calibre.constants import filesystem_encoding, iswindows
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log
from calibre.utils.magick import Image
from calibre.utils.magick.draw import identify_data, thumbnail
from calibre.utils.imghdr import what

import cStringIO
import ctypes  # An included library with Python install.

# BasicNewsRecipe and AutomaticNewsRecipe are referenced below; calibre's
# recipe loader normally provides them, but importing them explicitly keeps
# the file importable on its own as well.
from calibre.web.feeds.news import BasicNewsRecipe, AutomaticNewsRecipe

class FetchError(Exception):
    pass


class closing(object):

    'Context to automatically close something at the end of a block.'

    def __init__(self, thing):
        self.thing = thing

    def __enter__(self):
        return self.thing

    def __exit__(self, *exc_info):
        try:
            self.thing.close()
        except Exception:
            pass


bad_url_counter = 0


def basename(url):
    try:
        parts = urlparse.urlsplit(url)
        path = url2pathname(parts.path)
        res = os.path.basename(path)
    except:
        global bad_url_counter
        bad_url_counter += 1
        return 'bad_url_%d.html'%bad_url_counter
    if not os.path.splitext(res)[1]:
        return 'index.html'
    return res
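
# save_soup(): normalise the charset <meta> tag to UTF-8 and rewrite any
# absolute local file paths in img/link/a tags to relative ones before
# writing the HTML to disk.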
def save_soup(soup, target):
    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
class response(str):

    def __new__(cls, *args):
        obj = super(response, cls).__new__(cls, *args)
        obj.newurl = None
        return obj


def default_is_link_wanted(url, tag):
    raise NotImplementedError()
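
# RecursiveFetcher mirrors calibre.web.fetch.simple.RecursiveFetcher: it
# downloads a page plus its images, stylesheets and (up to max_recursions)
# linked pages into base_dir, remembering what it has already fetched in
# filemap/imagemap/stylemap so nothing is downloaded twice.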
class RecursiveFetcher(object):

    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
            ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
    # ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
    #         (
    #
    #         )
    #         )
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
    default_timeout = socket.getdefaulttimeout()  # Needed here as it is used in __del__
    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
        bd = options.dir
        if not isinstance(bd, unicode):
            bd = bd.decode(filesystem_encoding)

        self.base_dir = os.path.abspath(os.path.expanduser(bd))
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)
        self.log = log
        self.verbose = options.verbose
        self.timeout = options.timeout
        self.encoding = options.encoding
        self.browser = options.browser if hasattr(options, 'browser') else browser()
        self.max_recursions = options.max_recursions
        self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
        self.max_files = options.max_files
        self.delay = options.delay
        self.last_fetch_at = 0.
        self.filemap = {}
        self.imagemap = image_map
        self.imagemap_lock = threading.RLock()
        self.stylemap = css_map
        self.image_url_processor = None
        self.stylemap_lock = threading.RLock()
        self.downloaded_paths = []
        self.current_dir = self.base_dir
        self.files = 0
        self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
        self.remove_tags = getattr(options, 'remove_tags', [])
        self.remove_tags_after = getattr(options, 'remove_tags_after', None)
        self.remove_tags_before = getattr(options, 'remove_tags_before', None)
        self.keep_only_tags = getattr(options, 'keep_only_tags', [])
        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
        self.preprocess_raw_html = getattr(options, 'preprocess_raw_html',
                lambda raw, url: raw)
        self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
        self.postprocess_html_ext = getattr(options, 'postprocess_html', None)
        self._is_link_wanted = getattr(options, 'is_link_wanted',
                default_is_link_wanted)
        self.compress_news_images_max_size = getattr(options, 'compress_news_images_max_size', None)
        self.compress_news_images = getattr(options, 'compress_news_images', False)
        self.compress_news_images_auto_size = getattr(options, 'compress_news_images_auto_size', 16)
        self.scale_news_images = getattr(options, 'scale_news_images', None)
        self.download_stylesheets = not options.no_stylesheets
        self.show_progress = True
        self.failed_links = []
        self.job_info = job_info
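
    # get_soup(): build a BeautifulSoup tree from raw HTML and apply the
    # recipe's cleanup hooks (preprocess_regexps, keep_only_tags,
    # remove_tags_before/after, remove_tags, preprocess_html).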
    def get_soup(self, src, url=None):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Some websites have buggy doctype declarations that mess up beautifulsoup
        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        soup = BeautifulSoup(usrc, markupMassage=nmassage)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            tag = soup.find(**self.remove_tags_before)
            remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
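
    # fetch_url(): download a single URL through the shared mechanize
    # browser, honouring the inter-request delay. The debug prints and the
    # gzip fallback for URLs ending in '/' whose body does not look like
    # HTML are the author's additions to the calibre original.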
    def fetch_url(self, url):
        import urllib2
        from StringIO import StringIO
        import gzip
        print 'FETCH_URL!'
        data = None
        self.log.debug('Fetching', url)

        # Check for a URL pointing to the local filesystem and special case it
        # for efficiency and robustness. Bypasses delay checking as it does not
        # apply to local fetches. Ensures that unicode paths that are not
        # representable in the filesystem_encoding work.
        is_local = 0
        if url.startswith('file://'):
            is_local = 7
        elif url.startswith('file:'):
            is_local = 5
        if is_local > 0:
            url = url[is_local:]
            if iswindows and url.startswith('/'):
                url = url[1:]
            with open(url, 'rb') as f:
                data = response(f.read())
                data.newurl = 'file:'+url  # This is what mechanize does for local URLs
            return data

        delta = time.time() - self.last_fetch_at
        if delta < self.delay:
            time.sleep(self.delay - delta)
        if isinstance(url, unicode):
            url = url.encode('utf-8')
        # Not sure if this is really needed as I think mechanize
        # handles quoting automatically, but leaving it
        # in case it breaks something
        if re.search(r'\s+', url) is not None:
            purl = list(urlparse.urlparse(url))
            for i in range(2, 6):
                purl[i] = quote(purl[i])
            url = urlparse.urlunparse(purl)
        open_func = getattr(self.browser, 'open_novisit', self.browser.open)
        try:
            print url, self.timeout
            with closing(open_func(url, timeout=self.timeout)) as f:
                data = response(f.read()+f.read())
                if url.endswith('/'):
                    if '<html' not in data:
                        print 'INVALID_HTML'
                        buf = StringIO(data)  # buf = BytesIO(data)
                        d = gzip.GzipFile(fileobj=buf)
                        data = response(d.read())
                        # print 'DATA=', data
                data.newurl = f.geturl()
            # request = urllib2.Request(url)
            # if url.endswith('/'):
            #     request.add_header('Accept-encoding', 'gzip')
            # res = urllib2.urlopen(request)
            # if res.info().get('Content-Encoding') == 'gzip':
            #     buf = StringIO(res.read())
            #     f = gzip.GzipFile(fileobj=buf)
            #     data = f.read()
            # print 'DATA=', data
        except urllib2.URLError as err:
            if hasattr(err, 'code') and responses.has_key(err.code):
                raise FetchError, responses[err.code]
            if getattr(err, 'reason', [0])[0] == 104 or \
                getattr(getattr(err, 'args', [None])[0], 'errno', None) in (-2,
                        -3):  # Connection reset by peer or Name or service not known
                self.log.debug('Temporary error, retrying in 1 second')
                time.sleep(1)
                with closing(open_func(url, timeout=self.timeout)) as f:
                    data = response(f.read()+f.read())
                    if url.endswith('/'):
                        if '<html' not in data:
                            print 'INVALID_HTML'
                            buf = StringIO(data)  # buf = BytesIO(data)
                            d = gzip.GzipFile(fileobj=buf)
                            data = response(d.read())
                            # print 'DATA=', data
                    data.newurl = f.geturl()
            else:
                raise err
        finally:
            self.last_fetch_at = time.time()
        return data
    def start_fetch(self, url):
        print 'START_FETCH!'
        soup = BeautifulSoup(u'<a href="'+url+'" />')
        self.log.debug('Downloading')
        res = self.process_links(soup, url, 0, into_dir='')
        self.log.debug(url, 'saved to', res)
        return res

    def is_link_ok(self, url):
        for i in self.__class__.LINK_FILTER:
            if i.search(url):
                return False
        return True

    def is_link_wanted(self, url, tag):
        try:
            return self._is_link_wanted(url, tag)
        except NotImplementedError:
            pass
        except:
            return False
        if self.filter_regexps:
            for f in self.filter_regexps:
                if f.search(url):
                    return False
        if self.match_regexps:
            for m in self.match_regexps:
                if m.search(url):
                    return True
            return False
        return True
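
    # process_stylesheets(): download linked and @import-ed CSS into a
    # local 'stylesheets' directory and point the tags at the saved files.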
    def process_stylesheets(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and
                tag.has_key('type') and tag['type'].lower() == 'text/css')):
            if tag.has_key('href'):
                iurl = tag['href']
                if not urlparse.urlsplit(iurl).scheme:
                    iurl = urlparse.urljoin(baseurl, iurl, False)
                with self.stylemap_lock:
                    if self.stylemap.has_key(iurl):
                        tag['href'] = self.stylemap[iurl]
                        continue
                try:
                    data = self.fetch_url(iurl)
                except Exception:
                    self.log.exception('Could not fetch stylesheet ', iurl)
                    continue
                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                with self.stylemap_lock:
                    self.stylemap[iurl] = stylepath
                with open(stylepath, 'wb') as x:
                    x.write(data)
                tag['href'] = stylepath
            else:
                for ns in tag.findAll(text=True):
                    src = str(ns)
                    m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                    if m:
                        iurl = m.group(1)
                        if not urlparse.urlsplit(iurl).scheme:
                            iurl = urlparse.urljoin(baseurl, iurl, False)
                        with self.stylemap_lock:
                            if self.stylemap.has_key(iurl):
                                ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                                continue
                        try:
                            data = self.fetch_url(iurl)
                        except Exception:
                            self.log.exception('Could not fetch stylesheet ', iurl)
                            continue
                        c += 1
                        stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                        with self.stylemap_lock:
                            self.stylemap[iurl] = stylepath
                        with open(stylepath, 'wb') as x:
                            x.write(data)
                        ns.replaceWith(src.replace(m.group(1), stylepath))
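
    # rescale_image(): optionally scale a downloaded image to
    # scale_news_images, then re-encode it as JPEG at decreasing quality
    # until it fits under the configured size budget.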
    def rescale_image(self, data):
        orig_w, orig_h, ifmt = identify_data(data)
        orig_data = data  # save it in case compression fails
        if self.scale_news_images is not None:
            wmax, hmax = self.scale_news_images
            scale, new_w, new_h = fit_image(orig_w, orig_h, wmax, hmax)
            if scale:
                data = thumbnail(data, new_w, new_h, compression_quality=95)[-1]
                orig_w = new_w
                orig_h = new_h
        if self.compress_news_images_max_size is None:
            if self.compress_news_images_auto_size is None:  # not compressing
                return data
            else:
                maxsizeb = (orig_w * orig_h)/self.compress_news_images_auto_size
        else:
            maxsizeb = self.compress_news_images_max_size * 1024
        scaled_data = data  # save it in case compression fails
        if len(scaled_data) <= maxsizeb:  # no compression required
            return scaled_data

        img = Image()
        quality = 95
        img.load(data)
        while len(data) >= maxsizeb and quality >= 5:
            quality -= 5
            img.set_compression_quality(quality)
            data = img.export('jpg')

        if len(data) >= len(scaled_data):  # compression failed
            return orig_data if len(orig_data) <= len(scaled_data) else scaled_data

        if len(data) >= len(orig_data):  # no improvement
            return orig_data

        return data
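
    # process_images(): save every <img> (including data: URIs) into a
    # local 'images' directory, converting unsupported formats and
    # optionally compressing JPEGs via rescale_image().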
    def process_images(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
            iurl = tag['src']
            if iurl.startswith('data:image/'):
                try:
                    data = b64decode(iurl.partition(',')[-1])
                except:
                    self.log.exception('Failed to decode embedded image')
                    continue
            else:
                if callable(self.image_url_processor):
                    iurl = self.image_url_processor(baseurl, iurl)
                if not urlparse.urlsplit(iurl).scheme:
                    iurl = urlparse.urljoin(baseurl, iurl, False)
                with self.imagemap_lock:
                    if self.imagemap.has_key(iurl):
                        tag['src'] = self.imagemap[iurl]
                        continue
                try:
                    data = self.fetch_url(iurl)
                    if data == 'GIF89a\x01':
                        # Skip empty GIF files as PIL errors on them anyway
                        continue
                except Exception:
                    self.log.exception('Could not fetch image ', iurl)
                    continue
            c += 1
            fname = ascii_filename('img'+str(c))
            if isinstance(fname, unicode):
                fname = fname.encode('ascii', 'replace')
            itype = what(None, data)
            if itype is None and b'<svg' in data[:1024]:
                # SVG image
                imgpath = os.path.join(diskpath, fname+'.svg')
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            else:
                try:
                    if itype not in {'png', 'jpg', 'jpeg'}:
                        itype = 'png' if itype == 'gif' else 'jpg'
                        im = Image()
                        im.load(data)
                        data = im.export(itype)
                    if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                        try:
                            data = self.rescale_image(data)
                        except:
                            self.log.exception('failed to compress image '+iurl)
                            identify_data(data)
                    else:
                        identify_data(data)
                    # Moon+ apparently cannot handle .jpeg files
                    if itype == 'jpeg':
                        itype = 'jpg'
                    imgpath = os.path.join(diskpath, fname+'.'+itype)
                    with self.imagemap_lock:
                        self.imagemap[iurl] = imgpath
                    with open(imgpath, 'wb') as x:
                        x.write(data)
                    tag['src'] = imgpath
                except:
                    traceback.print_exc()
                    continue
    def absurl(self, baseurl, tag, key, filter=True):
        iurl = tag[key]
        parts = urlparse.urlsplit(iurl)
        if not parts.netloc and not parts.path and not parts.query:
            return None
        if not parts.scheme:
            iurl = urlparse.urljoin(baseurl, iurl, False)
        if not self.is_link_ok(iurl):
            self.log.debug('Skipping invalid link:', iurl)
            return None
        if filter and not self.is_link_wanted(iurl, tag):
            self.log.debug('Filtered link: '+iurl)
            return None
        return iurl

    def normurl(self, url):
        parts = list(urlparse.urlsplit(url))
        parts[4] = ''
        return urlparse.urlunsplit(parts)

    def localize_link(self, tag, key, path):
        parts = urlparse.urlsplit(tag[key])
        suffix = '#'+parts.fragment if parts.fragment else ''
        tag[key] = path+suffix

    def process_return_links(self, soup, baseurl):
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'a' and tag.has_key('href')):
            iurl = self.absurl(baseurl, tag, 'href')
            if not iurl:
                continue
            nurl = self.normurl(iurl)
            if self.filemap.has_key(nurl):
                self.localize_link(tag, 'href', self.filemap[nurl])
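
    # process_links(): the main recursion. For every <a href> that passes
    # the filters, fetch the target, process its images/stylesheets, save
    # it as linkN/*.xhtml and rewrite the link to point at the local copy.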
    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
        res = ''
        diskpath = os.path.join(self.current_dir, into_dir)
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            tags = list(soup.findAll('a', href=True))

            for c, tag in enumerate(tags):
                if self.show_progress:
                    print '.',
                    sys.stdout.flush()
                sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
                if self.filemap.has_key(nurl):
                    self.localize_link(tag, 'href', self.filemap[nurl])
                    continue
                if self.files > self.max_files:
                    return res
                linkdir = 'link'+str(c) if into_dir else ''
                linkdiskpath = os.path.join(diskpath, linkdir)
                if not os.path.exists(linkdiskpath):
                    os.mkdir(linkdiskpath)
                try:
                    self.current_dir = linkdiskpath
                    dsrc = self.fetch_url(iurl)
                    newbaseurl = dsrc.newurl
                    if len(dsrc) == 0 or \
                       len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                        raise ValueError('No content at URL %r'%iurl)
                    if callable(self.encoding):
                        dsrc = self.encoding(dsrc)
                    elif self.encoding is not None:
                        dsrc = dsrc.decode(self.encoding, 'replace')
                    else:
                        dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                    soup = self.get_soup(dsrc, url=iurl)

                    base = soup.find('base', href=True)
                    if base is not None:
                        newbaseurl = base['href']
                    self.log.debug('Processing images...')
                    self.process_images(soup, newbaseurl)
                    if self.download_stylesheets:
                        self.process_stylesheets(soup, newbaseurl)

                    _fname = basename(iurl)
                    if not isinstance(_fname, unicode):
                        _fname.decode('latin1', 'replace')
                    _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
                    _fname = ascii_filename(_fname)
                    _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                    res = os.path.join(linkdiskpath, _fname)
                    self.downloaded_paths.append(res)
                    self.filemap[nurl] = res
                    if recursion_level < self.max_recursions:
                        self.log.debug('Processing links...')
                        self.process_links(soup, newbaseurl, recursion_level+1)
                    else:
                        self.process_return_links(soup, newbaseurl)
                        self.log.debug('Recursion limit reached. Skipping links in', iurl)

                    if callable(self.postprocess_html_ext):
                        soup = self.postprocess_html_ext(soup,
                                c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                                self.job_info)

                        if c==0 and recursion_level == 0:
                            self.called_first = True

                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
                except Exception:
                    self.failed_links.append((iurl, traceback.format_exc()))
                    self.log.exception('Could not fetch link', iurl)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
        finally:
            self.current_dir = prev_dir
            if self.show_progress:
                print
        return res
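

# The recipe itself: an automatic recipe for habrahabr.ru that keeps only
# the article title, body and comments, and routes article downloads
# through the RecursiveFetcher copy above.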
class BasicUserRecipe1399883053(AutomaticNewsRecipe):
    title = u'habr'
    oldest_article = 100
    max_articles_per_feed = 100
    auto_cleanup = False
    remove_javascript = True
    encoding = 'utf8'
    simultaneous_downloads = 1
    timeout = 300

    remove_tags = []
    keep_only_tags = [dict(name='h2', attrs={'class':'title'}),
                      dict(name='h1', attrs={'class':'title'}),
                      dict(name='span', attrs={'class':'post_title'}),
                      dict(name='div', attrs={'class':'content html_format'}),
                      dict(name='div', attrs={'id':'comments'}),
                      dict(name='div', attrs={'class':'comments_list '}),
                      dict(name='div', attrs={'class':'comments_list'}),
                      dict(name='div', attrs={'class':'message html_format '})
                      ]

    feeds = [(u'habr', u'http://habrahabr.ru/rss/hubs/')]
    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            for article in feed.articles:
                print 'article.title is: ', article.title
                print 'url: ', article.url
        return feeds

    def preprocess_html(self, soup):
        print 'PREPROCESS_HTML'
        # print(soup.get_text())
        # for t in soup.findAll('div'):
        #     print t.name
        #     print t
        # print(soup.prettify())
        return soup

    def get_article(self, candidates, best_candidate):
        # Note: as written this override calls itself (and passes self as an
        # extra argument), so it would recurse forever if ever invoked; it
        # appears to be a leftover debugging stub.
        print 'GET_ARTICLE'
        ret = self.get_article(self, candidates, best_candidate)
        return ret
    # def extract_readable_article(self, html, url):
    #     print 'EXTRACT_HTML'
    #     article = extract_readable_article(self, html, url)
    #     return article

    def preprocess_raw_html_(self, raw_html, url):
        print 'PREPROCESS_RAW_HTML'
        # print raw_html
        raw_html = self.preprocess_raw_html(raw_html, url)
        if self.auto_cleanup:
            try:
                raw_html = self.extract_readable_article(raw_html, url)
            except:
                self.log.exception('Auto cleanup of URL: %r failed'%url)
        return raw_html
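
    # _fetch_article(): replacement for BasicNewsRecipe._fetch_article that
    # instantiates the RecursiveFetcher copy defined above (instead of
    # calibre.web.fetch.simple.RecursiveFetcher) so the extra debugging and
    # the gzip workaround are used when downloading each article.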
    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
        # from calibre.web.fetch.simple import RecursiveFetcher
        print 'FETCH_ARTICLE!'
        br = self.browser
        if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
            # We are using the default get_browser, which means no need to
            # clone
            br = BasicNewsRecipe.get_browser(self)
        else:
            br = self.clone_browser(self.browser)
        self.web2disk_options.browser = br
        i = 0
        for i in range(0, 1):  # 5-10
            i = i + 1
            fetcher = RecursiveFetcher(self.web2disk_options, self.log,
                    self.image_map, self.css_map,
                    (url, f, a, num_of_feeds))
            fetcher.browser = br
            fetcher.base_dir = dir_
            fetcher.current_dir = dir_
            fetcher.show_progress = False
            fetcher.image_url_processor = self.image_url_processor
            res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
            print 'FETCH_PATH:', path
            print 'RES:', res
            print 'FAIL:', failures
            # print 'dir_:', dir_, ' br:', br
            # Leftover retry/validation logic (disabled): re-fetch the article
            # until the saved file contains an <html> tag.
            # valid = False
            # ctypes.windll.user32.MessageBoxA(0, "pause", "Your title", 1)
            # f = open(res)
            # lines = f.readlines()
            # f.close()
            # for line in lines:
            #     print 'LINE:', line
            #     if '<html>' in line:
            #         valid = True
            #         break
            #     else:
            #         continue
            # if valid:
            #     print 'HTML_VALID:', i
            #     ctypes.windll.user32.MessageBoxA(0, "pause", "Your title", 1)
            #     break
            # else:
            #     print 'HTML_INVALID:', i
            #     ctypes.windll.user32.MessageBoxA(0, "pause INVALID", "Your title", 1)
            #     time.sleep(10)  # delays for 10 seconds
            #     os.remove(res)
            #     continue

        if not res or not os.path.exists(res):
            msg = _('Could not fetch article.') + ' '
            if self.debug:
                msg += _('The debug traceback is available earlier in this log')
            else:
                msg += _('Run with -vv to see the reason')
            raise Exception(msg)

        return res, path, failures
    '''
    def _fetch_article(self, url, dird, f, a, numoffeeds):
        print 'FETCH_ARTICLE'
        res, path, failures = BasicNewsRecipe._fetch_article(self, url, dird, f, a, numoffeeds)
        print 'FETCH_PATH:', path
        print 'FETCH_RES:', res
        print 'FETCH_FAIL:', failures
        return res, path, failures
    '''

    '''
    def parse_feeds(self):
        from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
        from calibre import (browser, __appname__, iswindows, force_unicode, strftime,
                             preferred_encoding, as_unicode)
        from contextlib import nested, closing
        print 'PARSE_FEEDS'
        # Create a list of articles from the list of feeds returned by
        # :meth:`BasicNewsRecipe.get_feeds`. Return a list of :class:`Feed` objects.
        feeds = self.get_feeds()
        parsed_feeds = []
        for obj in feeds:
            if isinstance(obj, basestring):
                title, url = None, obj
            else:
                title, url = obj
            if url.startswith('feed://'):
                url = 'http'+url[4:]
            self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
            try:
                with closing(self.browser.open(url)) as f:
                    parsed_feeds.append(feed_from_xml(f.read(),
                                        title=title,
                                        log=self.log,
                                        oldest_article=self.oldest_article,
                                        max_articles_per_feed=self.max_articles_per_feed,
                                        get_article_url=self.get_article_url))
                if (self.delay > 0):
                    time.sleep(self.delay)
            except Exception as err:
                feed = Feed()
                msg = 'Failed feed: %s'%(title if title else url)
                feed.populate_from_preparsed_feed(msg, [])
                feed.description = as_unicode(err)
                parsed_feeds.append(feed)
                self.log.exception(msg)

        remove = [fl for fl in parsed_feeds if len(fl) == 0 and
                self.remove_empty_feeds]
        for f in remove:
            parsed_feeds.remove(f)

        for feed in parsed_feeds:
            for article in feed.articles[:]:
                print 'article.title is: ', article.title
                print 'url: ', article.url
                print article.summary

        return parsed_feeds
    '''
    '''
    def extract_readable_article(self, html, url):
        print 'EXTRACT_HTML'
        # print html
        # Extracts the main article content from 'html', cleans it up and
        # returns it as an (article_html, extracted_title) tuple.
        # Based on the original readability algorithm by Arc90.
        from calibre.ebooks.readability import readability
        from lxml.html import (fragment_fromstring, tostring,
                               document_fromstring)
        doc = readability.Document(html, self.log, url=url,
                keep_elements=self.auto_cleanup_keep)
        article_html = doc.summary()
        # print 'article_html =', article_html
        extracted_title = doc.title()
        print 'doc.title =', extracted_title

        try:
            frag = fragment_fromstring(article_html)
        except:
            doc = document_fromstring(article_html)
            frag = doc.xpath('//body')[-1]
        if frag.tag == 'html':
            root = frag
        elif frag.tag == 'body':
            root = document_fromstring(
                u'<html><head><title>%s</title></head></html>' %
                extracted_title)
            root.append(frag)
        else:
            root = document_fromstring(
                u'<html><head><title>%s</title></head><body/></html>' %
                extracted_title)
            root.xpath('//body')[0].append(frag)
        print 'frag.tag =', frag.tag

        body = root.xpath('//body')[0]
        has_title = False
        for x in body.iterdescendants():
            if x.text == extracted_title:
                has_title = True
        inline_titles = body.xpath('//h1|//h2')
        if not has_title and not inline_titles:
            heading = body.makeelement('h2')
            heading.text = extracted_title
            body.insert(0, heading)

        raw_html = tostring(root, encoding=unicode)
        return raw_html
    '''
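
# Usage sketch (an assumption, not part of the original gist): save this file
# as habr.recipe and let calibre build an ebook from it, e.g.
#
#     ebook-convert habr.recipe habr.epub --test -vv
#
# --test limits the run to a couple of articles per feed and -vv turns on the
# verbose logging referred to in the error message above.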