calibre habrahabr recipe
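A calibre news recipe for Habrahabr that bundles a modified copy of calibre's calibre.web.fetch.simple.RecursiveFetcher (with extra debug printing and a manual gzip-decoding fallback) together with the recipe class that drives it.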
import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
from urllib import url2pathname, quote
from httplib import responses
from base64 import b64decode

from calibre import browser, relpath, unicode_path, fit_image
from calibre.constants import filesystem_encoding, iswindows
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log
from calibre.utils.magick import Image
from calibre.utils.magick.draw import identify_data, thumbnail
from calibre.utils.imghdr import what

import os, time, traceback, re, urlparse, sys, cStringIO
import ctypes  # An included library with Python install.
import time
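
# NOTE: the RecursiveFetcher class below is a modified copy of
# calibre.web.fetch.simple.RecursiveFetcher; the recipe's _fetch_article()
# instantiates it directly instead of the stock fetcher, so the extra debug
# printing and the gzip fallback in fetch_url() are used.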

class FetchError(Exception):
    pass


class closing(object):

    'Context to automatically close something at the end of a block.'

    def __init__(self, thing):
        self.thing = thing

    def __enter__(self):
        return self.thing

    def __exit__(self, *exc_info):
        try:
            self.thing.close()
        except Exception:
            pass


bad_url_counter = 0


def basename(url):
    try:
        parts = urlparse.urlsplit(url)
        path = url2pathname(parts.path)
        res = os.path.basename(path)
    except:
        global bad_url_counter
        bad_url_counter += 1
        return 'bad_url_%d.html'%bad_url_counter
    if not os.path.splitext(res)[1]:
        return 'index.html'
    return res

def save_soup(soup, target):
    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))


class response(str):

    def __new__(cls, *args):
        obj = super(response, cls).__new__(cls, *args)
        obj.newurl = None
        return obj


def default_is_link_wanted(url, tag):
    raise NotImplementedError()

class RecursiveFetcher(object):

    import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
    from urllib import url2pathname, quote, urlopen
    from httplib import responses
    from base64 import b64decode

    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
    # ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
    #                        (
    #
    #                        )
    #                       )
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
    default_timeout = socket.getdefaulttimeout()  # Needed here as it is used in __del__

    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
        bd = options.dir
        if not isinstance(bd, unicode):
            bd = bd.decode(filesystem_encoding)

        self.base_dir = os.path.abspath(os.path.expanduser(bd))
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)
        self.log = log
        self.verbose = options.verbose
        self.timeout = options.timeout
        self.encoding = options.encoding
        self.browser = options.browser if hasattr(options, 'browser') else browser()
        self.max_recursions = options.max_recursions
        self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
        self.max_files = options.max_files
        self.delay = options.delay
        self.last_fetch_at = 0.
        self.filemap = {}
        self.imagemap = image_map
        self.imagemap_lock = threading.RLock()
        self.stylemap = css_map
        self.image_url_processor = None
        self.stylemap_lock = threading.RLock()
        self.downloaded_paths = []
        self.current_dir = self.base_dir
        self.files = 0
        self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
        self.remove_tags = getattr(options, 'remove_tags', [])
        self.remove_tags_after = getattr(options, 'remove_tags_after', None)
        self.remove_tags_before = getattr(options, 'remove_tags_before', None)
        self.keep_only_tags = getattr(options, 'keep_only_tags', [])
        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
        self.preprocess_raw_html = getattr(options, 'preprocess_raw_html',
                lambda raw, url: raw)
        self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
        self.postprocess_html_ext = getattr(options, 'postprocess_html', None)
        self._is_link_wanted = getattr(options, 'is_link_wanted',
                default_is_link_wanted)
        self.compress_news_images_max_size = getattr(options, 'compress_news_images_max_size', None)
        self.compress_news_images = getattr(options, 'compress_news_images', False)
        self.compress_news_images_auto_size = getattr(options, 'compress_news_images_auto_size', 16)
        self.scale_news_images = getattr(options, 'scale_news_images', None)
        self.download_stylesheets = not options.no_stylesheets
        self.show_progress = True
        self.failed_links = []
        self.job_info = job_info
    def get_soup(self, src, url=None):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]  # Some websites have buggy doctype declarations that mess up beautifulsoup
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        soup = BeautifulSoup(usrc, markupMassage=nmassage)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            tag = soup.find(**self.remove_tags_before)
            remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()

        return self.preprocess_html_ext(soup)
    def fetch_url(self, url):
        import urllib2
        from StringIO import StringIO
        import gzip
        print 'FETCH_URL!'
        data = None
        self.log.debug('Fetching', url)

        # Check for a URL pointing to the local filesystem and special case it
        # for efficiency and robustness. Bypasses delay checking as it does not
        # apply to local fetches. Ensures that unicode paths that are not
        # representable in the filesystem_encoding work.
        is_local = 0
        if url.startswith('file://'):
            is_local = 7
        elif url.startswith('file:'):
            is_local = 5
        if is_local > 0:
            url = url[is_local:]
            if iswindows and url.startswith('/'):
                url = url[1:]
            with open(url, 'rb') as f:
                data = response(f.read())
                data.newurl = 'file:'+url  # This is what mechanize does for
                                           # local URLs
            return data

        delta = time.time() - self.last_fetch_at
        if delta < self.delay:
            time.sleep(self.delay - delta)
        if isinstance(url, unicode):
            url = url.encode('utf-8')
        # Not sure if this is really needed as I think mechanize
        # handles quoting automatically, but leaving it
        # in case it breaks something
        if re.search(r'\s+', url) is not None:
            purl = list(urlparse.urlparse(url))
            for i in range(2, 6):
                purl[i] = quote(purl[i])
            url = urlparse.urlunparse(purl)
        open_func = getattr(self.browser, 'open_novisit', self.browser.open)
        try:
            print url, self.timeout
            with closing(open_func(url, timeout=self.timeout)) as f:
                data = response(f.read()+f.read())
                # Workaround: for directory-style URLs, if the body does not look
                # like HTML, assume it is gzip-compressed and decompress manually.
                if url.endswith('/'):
                    if '<html' not in data:
                        print 'INVALID_HTML'
                        buf = StringIO(data)  # buf = BytesIO(data)
                        d = gzip.GzipFile(fileobj=buf)
                        data = response(d.read())
                        # print 'DATA=', data
                data.newurl = f.geturl()
                # Alternative (disabled) implementation of the same workaround:
                # request = urllib2.Request(url)
                # if url.endswith('/'):
                #     request.add_header('Accept-encoding', 'gzip')
                #     res = urllib2.urlopen(request)
                #     if res.info().get('Content-Encoding') == 'gzip':
                #         buf = StringIO(res.read())
                #         f = gzip.GzipFile(fileobj=buf)
                #         data = f.read()
                # print 'DATA=', data
        except urllib2.URLError as err:
            if hasattr(err, 'code') and responses.has_key(err.code):
                raise FetchError, responses[err.code]
            if getattr(err, 'reason', [0])[0] == 104 or \
                getattr(getattr(err, 'args', [None])[0], 'errno', None) in (-2,
                        -3):  # Connection reset by peer or Name or service not known
                self.log.debug('Temporary error, retrying in 1 second')
                time.sleep(1)
                with closing(open_func(url, timeout=self.timeout)) as f:
                    data = response(f.read()+f.read())
                    if url.endswith('/'):
                        if '<html' not in data:
                            print 'INVALID_HTML'
                            buf = StringIO(data)  # buf = BytesIO(data)
                            d = gzip.GzipFile(fileobj=buf)
                            data = response(d.read())
                            # print 'DATA=', data
                    data.newurl = f.geturl()
            else:
                raise err
        finally:
            self.last_fetch_at = time.time()
        return data
    def start_fetch(self, url):
        print 'START_FETCH!'
        soup = BeautifulSoup(u'<a href="'+url+'" />')
        self.log.debug('Downloading')
        res = self.process_links(soup, url, 0, into_dir='')
        self.log.debug(url, 'saved to', res)
        return res

    def is_link_ok(self, url):
        for i in self.__class__.LINK_FILTER:
            if i.search(url):
                return False
        return True

    def is_link_wanted(self, url, tag):
        try:
            return self._is_link_wanted(url, tag)
        except NotImplementedError:
            pass
        except:
            return False
        if self.filter_regexps:
            for f in self.filter_regexps:
                if f.search(url):
                    return False
        if self.match_regexps:
            for m in self.match_regexps:
                if m.search(url):
                    return True
            return False
        return True
    def process_stylesheets(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')):
            if tag.has_key('href'):
                iurl = tag['href']
                if not urlparse.urlsplit(iurl).scheme:
                    iurl = urlparse.urljoin(baseurl, iurl, False)
                with self.stylemap_lock:
                    if self.stylemap.has_key(iurl):
                        tag['href'] = self.stylemap[iurl]
                        continue
                try:
                    data = self.fetch_url(iurl)
                except Exception:
                    self.log.exception('Could not fetch stylesheet ', iurl)
                    continue
                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                with self.stylemap_lock:
                    self.stylemap[iurl] = stylepath
                with open(stylepath, 'wb') as x:
                    x.write(data)
                tag['href'] = stylepath
            else:
                for ns in tag.findAll(text=True):
                    src = str(ns)
                    m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                    if m:
                        iurl = m.group(1)
                        if not urlparse.urlsplit(iurl).scheme:
                            iurl = urlparse.urljoin(baseurl, iurl, False)
                        with self.stylemap_lock:
                            if self.stylemap.has_key(iurl):
                                ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                                continue
                        try:
                            data = self.fetch_url(iurl)
                        except Exception:
                            self.log.exception('Could not fetch stylesheet ', iurl)
                            continue
                        c += 1
                        stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                        with self.stylemap_lock:
                            self.stylemap[iurl] = stylepath
                        with open(stylepath, 'wb') as x:
                            x.write(data)
                        ns.replaceWith(src.replace(m.group(1), stylepath))
    def rescale_image(self, data):
        orig_w, orig_h, ifmt = identify_data(data)
        orig_data = data  # save it in case compression fails
        if self.scale_news_images is not None:
            wmax, hmax = self.scale_news_images
            scale, new_w, new_h = fit_image(orig_w, orig_h, wmax, hmax)
            if scale:
                data = thumbnail(data, new_w, new_h, compression_quality=95)[-1]
                orig_w = new_w
                orig_h = new_h
        if self.compress_news_images_max_size is None:
            if self.compress_news_images_auto_size is None:  # not compressing
                return data
            else:
                maxsizeb = (orig_w * orig_h)/self.compress_news_images_auto_size
        else:
            maxsizeb = self.compress_news_images_max_size * 1024
        scaled_data = data  # save it in case compression fails
        if len(scaled_data) <= maxsizeb:  # no compression required
            return scaled_data

        img = Image()
        quality = 95
        img.load(data)
        while len(data) >= maxsizeb and quality >= 5:
            quality -= 5
            img.set_compression_quality(quality)
            data = img.export('jpg')

        if len(data) >= len(scaled_data):  # compression failed
            return orig_data if len(orig_data) <= len(scaled_data) else scaled_data

        if len(data) >= len(orig_data):  # no improvement
            return orig_data

        return data
    def process_images(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
            iurl = tag['src']
            if iurl.startswith('data:image/'):
                try:
                    data = b64decode(iurl.partition(',')[-1])
                except:
                    self.log.exception('Failed to decode embedded image')
                    continue
            else:
                if callable(self.image_url_processor):
                    iurl = self.image_url_processor(baseurl, iurl)
                if not urlparse.urlsplit(iurl).scheme:
                    iurl = urlparse.urljoin(baseurl, iurl, False)
                with self.imagemap_lock:
                    if self.imagemap.has_key(iurl):
                        tag['src'] = self.imagemap[iurl]
                        continue
                try:
                    data = self.fetch_url(iurl)
                    if data == 'GIF89a\x01':
                        # Skip empty GIF files as PIL errors on them anyway
                        continue
                except Exception:
                    self.log.exception('Could not fetch image ', iurl)
                    continue
            c += 1
            fname = ascii_filename('img'+str(c))
            if isinstance(fname, unicode):
                fname = fname.encode('ascii', 'replace')
            itype = what(None, data)
            if itype is None and b'<svg' in data[:1024]:
                # SVG image
                imgpath = os.path.join(diskpath, fname+'.svg')
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            else:
                try:
                    if itype not in {'png', 'jpg', 'jpeg'}:
                        itype = 'png' if itype == 'gif' else 'jpg'
                        im = Image()
                        im.load(data)
                        data = im.export(itype)
                    if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                        try:
                            data = self.rescale_image(data)
                        except:
                            self.log.exception('failed to compress image '+iurl)
                            identify_data(data)
                    else:
                        identify_data(data)
                    # Moon+ apparently cannot handle .jpeg files
                    if itype == 'jpeg':
                        itype = 'jpg'
                    imgpath = os.path.join(diskpath, fname+'.'+itype)
                    with self.imagemap_lock:
                        self.imagemap[iurl] = imgpath
                    with open(imgpath, 'wb') as x:
                        x.write(data)
                    tag['src'] = imgpath
                except:
                    traceback.print_exc()
                    continue
    def absurl(self, baseurl, tag, key, filter=True):
        iurl = tag[key]
        parts = urlparse.urlsplit(iurl)
        if not parts.netloc and not parts.path and not parts.query:
            return None
        if not parts.scheme:
            iurl = urlparse.urljoin(baseurl, iurl, False)
        if not self.is_link_ok(iurl):
            self.log.debug('Skipping invalid link:', iurl)
            return None
        if filter and not self.is_link_wanted(iurl, tag):
            self.log.debug('Filtered link: '+iurl)
            return None
        return iurl

    def normurl(self, url):
        parts = list(urlparse.urlsplit(url))
        parts[4] = ''
        return urlparse.urlunsplit(parts)

    def localize_link(self, tag, key, path):
        parts = urlparse.urlsplit(tag[key])
        suffix = '#'+parts.fragment if parts.fragment else ''
        tag[key] = path+suffix

    def process_return_links(self, soup, baseurl):
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'a' and tag.has_key('href')):
            iurl = self.absurl(baseurl, tag, 'href')
            if not iurl:
                continue
            nurl = self.normurl(iurl)
            if self.filemap.has_key(nurl):
                self.localize_link(tag, 'href', self.filemap[nurl])
    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
        res = ''
        diskpath = os.path.join(self.current_dir, into_dir)
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            tags = list(soup.findAll('a', href=True))

            for c, tag in enumerate(tags):
                if self.show_progress:
                    print '.',
                    sys.stdout.flush()
                sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
                if self.filemap.has_key(nurl):
                    self.localize_link(tag, 'href', self.filemap[nurl])
                    continue
                if self.files > self.max_files:
                    return res
                linkdir = 'link'+str(c) if into_dir else ''
                linkdiskpath = os.path.join(diskpath, linkdir)
                if not os.path.exists(linkdiskpath):
                    os.mkdir(linkdiskpath)
                try:
                    self.current_dir = linkdiskpath
                    dsrc = self.fetch_url(iurl)
                    newbaseurl = dsrc.newurl
                    if len(dsrc) == 0 or \
                       len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                        raise ValueError('No content at URL %r'%iurl)
                    if callable(self.encoding):
                        dsrc = self.encoding(dsrc)
                    elif self.encoding is not None:
                        dsrc = dsrc.decode(self.encoding, 'replace')
                    else:
                        dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                    soup = self.get_soup(dsrc, url=iurl)

                    base = soup.find('base', href=True)
                    if base is not None:
                        newbaseurl = base['href']
                    self.log.debug('Processing images...')
                    self.process_images(soup, newbaseurl)
                    if self.download_stylesheets:
                        self.process_stylesheets(soup, newbaseurl)
                    _fname = basename(iurl)
                    if not isinstance(_fname, unicode):
                        _fname = _fname.decode('latin1', 'replace')
                    _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
                    _fname = ascii_filename(_fname)
                    _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                    res = os.path.join(linkdiskpath, _fname)
                    self.downloaded_paths.append(res)
                    self.filemap[nurl] = res
                    if recursion_level < self.max_recursions:
                        self.log.debug('Processing links...')
                        self.process_links(soup, newbaseurl, recursion_level+1)
                    else:
                        self.process_return_links(soup, newbaseurl)
                        self.log.debug('Recursion limit reached. Skipping links in', iurl)

                    if callable(self.postprocess_html_ext):
                        soup = self.postprocess_html_ext(soup,
                                c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                                self.job_info)

                        if c==0 and recursion_level == 0:
                            self.called_first = True

                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
                except Exception:
                    self.failed_links.append((iurl, traceback.format_exc()))
                    self.log.exception('Could not fetch link', iurl)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
        finally:
            self.current_dir = prev_dir
        if self.show_progress:
            print  # finish the line of progress dots
        return res
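
# The recipe proper: calibre instantiates the class below. keep_only_tags targets
# Habrahabr's post title, article body and comment markup, and _fetch_article()
# drives the RecursiveFetcher defined above.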

class BasicUserRecipe1399883053(AutomaticNewsRecipe):
    title = u'habr'
    oldest_article = 100
    max_articles_per_feed = 100
    auto_cleanup = False
    remove_javascript = True
    encoding = 'utf8'
    simultaneous_downloads = 1
    timeout = 300
    remove_tags = []
    keep_only_tags = [dict(name='h2', attrs={'class':'title'}),
                      dict(name='h1', attrs={'class':'title'}),
                      dict(name='span', attrs={'class':'post_title'}),
                      dict(name='div', attrs={'class':'content html_format'}),
                      dict(name='div', attrs={'id':'comments'}),
                      dict(name='div', attrs={'class':'comments_list '}),
                      dict(name='div', attrs={'class':'comments_list'}),
                      dict(name='div', attrs={'class':'message html_format '})
                      ]

    feeds = [(u'habr', u'http://habrahabr.ru/rss/hubs/')]

    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            for article in feed.articles:
                print 'article.title is: ', article.title
                print 'url: ', article.url
        return feeds

    def preprocess_html(self, soup):
        print 'PREPROCESS_HTML'
        # print(soup.get_text())
        # for t in soup.findAll('div'):
        #     print t.name
        #     print t
        # print (soup.prettify())
        return soup
    def get_article(self, candidates, best_candidate):
        print 'GET_ARTICLE'
        # NB: this calls itself, so it would recurse forever if it were ever invoked
        ret = self.get_article(self, candidates, best_candidate)
        return ret
    # def extract_readable_article(self, html, url):
    #     print 'EXTRACT_HTML'
    #     article = extract_readable_article(self, html, url)
    #     return article

    def preprocess_raw_html_(self, raw_html, url):
        print 'PREPROCESS_RAW_HTML'
        # print raw_html
        raw_html = self.preprocess_raw_html(raw_html, url)
        if self.auto_cleanup:
            try:
                raw_html = self.extract_readable_article(raw_html, url)
            except:
                self.log.exception('Auto_cleanup_of_URL: %r failed'%url)
        return raw_html
    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
        # from calibre.web.fetch.simple import RecursiveFetcher
        print 'FETCH_ARTICLE!'
        br = self.browser
        if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
            # We are using the default get_browser, which means no need to
            # clone
            br = BasicNewsRecipe.get_browser(self)
        else:
            br = self.clone_browser(self.browser)
        self.web2disk_options.browser = br
        i = 0
        for i in range(0, 1):  # 5-10
            i = i + 1
            fetcher = RecursiveFetcher(self.web2disk_options, self.log,
                    self.image_map, self.css_map,
                    (url, f, a, num_of_feeds))
            fetcher.browser = br
            fetcher.base_dir = dir_
            fetcher.current_dir = dir_
            fetcher.show_progress = False
            fetcher.image_url_processor = self.image_url_processor
            res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
            print 'FETCH_PATH:', path
            print 'RES:', res
            print 'FAIL:', failures
            # print 'dir_:', dir_, ' br:', br
            # valid = False
            # ctypes.windll.user32.MessageBoxA(0, "pause", "Your title", 1)
            # f = open(res)
            # lines = f.readlines()
            # f.close()
            # for line in lines:
            #     print 'LINE:', line
            #     if '<html>' in line:
            #         valid = True
            #         break
            #     else:
            #         continue
            # if valid:
            #     print 'HTML_VALID:', i
            #     ctypes.windll.user32.MessageBoxA(0, "pause", "Your title", 1)
            #     break
            # else:
            #     print 'HTML_INVALID:', i
            #     ctypes.windll.user32.MessageBoxA(0, "pause INVALID", "Your title", 1)
            #     time.sleep(10)  # delays for 5 seconds
            #     os.remove(res)
            #     continue

        if not res or not os.path.exists(res):
            msg = _('Could not fetch article.') + ' '
            if self.debug:
                msg += _('The debug traceback is available earlier in this log')
            else:
                msg += _('Run with -vv to see the reason')
            raise Exception(msg)

        return res, path, failures
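
    # The triple-quoted blocks below are earlier versions of _fetch_article(),
    # parse_feeds() and extract_readable_article(), disabled by wrapping them in
    # string literals and kept only for reference.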
    '''
    def _fetch_article(self, url, dird, f, a, numoffeeds):
        print 'FETCH_ARTICLE'
        res, path, failures = BasicNewsRecipe._fetch_article(self, url, dird, f, a, numoffeeds)
        print 'FETCH_PATH:', path
        print 'FETCH_RES:', res
        print 'FETCH_FAIL:', failures
        return res, path, failures
    '''

    '''
    def parse_feeds(self):
        from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
        from calibre import (browser, __appname__, iswindows, force_unicode, strftime, preferred_encoding, as_unicode)
        from contextlib import nested, closing
        print 'PARSE_FEEDS'
        # Create a list of articles from the list of feeds returned by :meth:`BasicNewsRecipe.get_feeds`.
        # Return a list of :class:`Feed` objects.
        feeds = self.get_feeds()
        parsed_feeds = []
        for obj in feeds:
            if isinstance(obj, basestring):
                title, url = None, obj
            else:
                title, url = obj
            if url.startswith('feed://'):
                url = 'http'+url[4:]
            self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
            try:
                with closing(self.browser.open(url)) as f:
                    parsed_feeds.append(feed_from_xml(f.read(),
                                        title=title,
                                        log=self.log,
                                        oldest_article=self.oldest_article,
                                        max_articles_per_feed=self.max_articles_per_feed,
                                        get_article_url=self.get_article_url))
                if (self.delay > 0):
                    time.sleep(self.delay)
            except Exception as err:
                feed = Feed()
                msg = 'Failed_feed: %s'%(title if title else url)
                feed.populate_from_preparsed_feed(msg, [])
                feed.description = as_unicode(err)
                parsed_feeds.append(feed)
                self.log.exception(msg)

        remove = [fl for fl in parsed_feeds if len(fl) == 0 and
                self.remove_empty_feeds]
        for f in remove:
            parsed_feeds.remove(f)

        for feed in parsed_feeds:
            for article in feed.articles[:]:
                print 'article.title is: ', article.title
                print 'url: ', article.url
                print article.summary

        return parsed_feeds
    '''
    '''
    def extract_readable_article(self, html, url):
        print 'EXTRACT_HTML'
        # print html
        # Extracts main article content from 'html', cleans up and returns as a
        # (article_html, extracted_title) tuple.
        # Based on the original readability algorithm by Arc90.
        from calibre.ebooks.readability import readability
        from lxml.html import (fragment_fromstring, tostring,
                document_fromstring)

        doc = readability.Document(html, self.log, url=url,
                keep_elements=self.auto_cleanup_keep)
        article_html = doc.summary()
        # print 'article_html =', article_html
        extracted_title = doc.title()
        print 'doc.title =', extracted_title

        try:
            frag = fragment_fromstring(article_html)
        except:
            doc = document_fromstring(article_html)
            frag = doc.xpath('//body')[-1]
        if frag.tag == 'html':
            root = frag
        elif frag.tag == 'body':
            root = document_fromstring(
                u'<html><head><title>%s</title></head></html>' %
                extracted_title)
            root.append(frag)
        else:
            root = document_fromstring(
                u'<html><head><title>%s</title></head><body/></html>' %
                extracted_title)
            root.xpath('//body')[0].append(frag)

        print 'frag.tag =', frag.tag

        body = root.xpath('//body')[0]
        has_title = False
        for x in body.iterdescendants():
            if x.text == extracted_title:
                has_title = True
        inline_titles = body.xpath('//h1|//h2')
        if not has_title and not inline_titles:
            heading = body.makeelement('h2')
            heading.text = extracted_title
            body.insert(0, heading)

        raw_html = tostring(root, encoding=unicode)

        return raw_html
    '''
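
# Usage sketch: a recipe like this can be test-built from the command line with
# calibre's ebook-convert tool, e.g.
#   ebook-convert habr.recipe habr.epub --test -vv
# ("habr.recipe" is just an assumed name for a locally saved copy of this file;
# -vv is the verbosity flag referred to by the 'Run with -vv to see the reason'
# message in _fetch_article() above).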