calibre habrahabr recipe
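A calibre news recipe for Habrahabr that bundles a modified copy of calibre's calibre.web.fetch.simple.RecursiveFetcher (with extra debug printing and a manual gzip-decoding fallback) together with the recipe class that drives it.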
import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
from urllib import url2pathname, quote
from httplib import responses
from base64 import b64decode

from calibre import browser, relpath, unicode_path, fit_image
from calibre.constants import filesystem_encoding, iswindows
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log
from calibre.utils.magick import Image
from calibre.utils.magick.draw import identify_data, thumbnail
from calibre.utils.imghdr import what

import os, time, traceback, re, urlparse, sys, cStringIO
import ctypes  # An included library with Python install.
import time
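
# NOTE: the RecursiveFetcher class below is a modified copy of
# calibre.web.fetch.simple.RecursiveFetcher; the recipe's _fetch_article()
# instantiates it directly instead of the stock fetcher, so the extra debug
# printing and the gzip fallback in fetch_url() are used.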

class FetchError(Exception):
    pass


class closing(object):

    'Context to automatically close something at the end of a block.'

    def __init__(self, thing):
        self.thing = thing

    def __enter__(self):
        return self.thing

    def __exit__(self, *exc_info):
        try:
            self.thing.close()
        except Exception:
            pass


bad_url_counter = 0


def basename(url):
    try:
        parts = urlparse.urlsplit(url)
        path = url2pathname(parts.path)
        res = os.path.basename(path)
    except:
        global bad_url_counter
        bad_url_counter += 1
        return 'bad_url_%d.html'%bad_url_counter
    if not os.path.splitext(res)[1]:
        return 'index.html'
    return res

def save_soup(soup, target):
    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))


class response(str):

    def __new__(cls, *args):
        obj = super(response, cls).__new__(cls, *args)
        obj.newurl = None
        return obj


def default_is_link_wanted(url, tag):
    raise NotImplementedError()

class RecursiveFetcher(object):

    import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
    from urllib import url2pathname, quote, urlopen
    from httplib import responses
    from base64 import b64decode

    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
    # ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
    #                        (
    #
    #                        )
    #                       )
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
    default_timeout = socket.getdefaulttimeout()  # Needed here as it is used in __del__

    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
        bd = options.dir
        if not isinstance(bd, unicode):
            bd = bd.decode(filesystem_encoding)

        self.base_dir = os.path.abspath(os.path.expanduser(bd))
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)
        self.log = log
        self.verbose = options.verbose
        self.timeout = options.timeout
        self.encoding = options.encoding
        self.browser = options.browser if hasattr(options, 'browser') else browser()
        self.max_recursions = options.max_recursions
        self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
        self.max_files = options.max_files
        self.delay = options.delay
        self.last_fetch_at = 0.
        self.filemap = {}
        self.imagemap = image_map
        self.imagemap_lock = threading.RLock()
        self.stylemap = css_map
        self.image_url_processor = None
        self.stylemap_lock = threading.RLock()
        self.downloaded_paths = []
        self.current_dir = self.base_dir
        self.files = 0
        self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
        self.remove_tags = getattr(options, 'remove_tags', [])
        self.remove_tags_after = getattr(options, 'remove_tags_after', None)
        self.remove_tags_before = getattr(options, 'remove_tags_before', None)
        self.keep_only_tags = getattr(options, 'keep_only_tags', [])
        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
        self.preprocess_raw_html = getattr(options, 'preprocess_raw_html',
                lambda raw, url: raw)
        self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
        self.postprocess_html_ext = getattr(options, 'postprocess_html', None)
        self._is_link_wanted = getattr(options, 'is_link_wanted',
                default_is_link_wanted)
        self.compress_news_images_max_size = getattr(options, 'compress_news_images_max_size', None)
        self.compress_news_images = getattr(options, 'compress_news_images', False)
        self.compress_news_images_auto_size = getattr(options, 'compress_news_images_auto_size', 16)
        self.scale_news_images = getattr(options, 'scale_news_images', None)
        self.download_stylesheets = not options.no_stylesheets
        self.show_progress = True
        self.failed_links = []
        self.job_info = job_info
    def get_soup(self, src, url=None):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]  # Some websites have buggy doctype declarations that mess up beautifulsoup
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        soup = BeautifulSoup(usrc, markupMassage=nmassage)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            tag = soup.find(**self.remove_tags_before)
            remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()

        return self.preprocess_html_ext(soup)
    def fetch_url(self, url):
        import urllib2
        from StringIO import StringIO
        import gzip
        print 'FETCH_URL!'
        data = None
        self.log.debug('Fetching', url)

        # Check for a URL pointing to the local filesystem and special case it
        # for efficiency and robustness. Bypasses delay checking as it does not
        # apply to local fetches. Ensures that unicode paths that are not
        # representable in the filesystem_encoding work.
        is_local = 0
        if url.startswith('file://'):
            is_local = 7
        elif url.startswith('file:'):
            is_local = 5
        if is_local > 0:
            url = url[is_local:]
            if iswindows and url.startswith('/'):
                url = url[1:]
            with open(url, 'rb') as f:
                data = response(f.read())
                data.newurl = 'file:'+url  # This is what mechanize does for
                                           # local URLs
            return data

        delta = time.time() - self.last_fetch_at
        if delta < self.delay:
            time.sleep(self.delay - delta)
        if isinstance(url, unicode):
            url = url.encode('utf-8')
        # Not sure if this is really needed as I think mechanize
        # handles quoting automatically, but leaving it
        # in case it breaks something
        if re.search(r'\s+', url) is not None:
            purl = list(urlparse.urlparse(url))
            for i in range(2, 6):
                purl[i] = quote(purl[i])
            url = urlparse.urlunparse(purl)
        open_func = getattr(self.browser, 'open_novisit', self.browser.open)
        try:
            print url, self.timeout
            with closing(open_func(url, timeout=self.timeout)) as f:
                data = response(f.read()+f.read())
                # Workaround: for directory-style URLs, if the body does not look
                # like HTML, assume it is gzip-compressed and decompress manually.
                if url.endswith('/'):
                    if '<html' not in data:
                        print 'INVALID_HTML'
                        buf = StringIO(data)  # buf = BytesIO(data)
                        d = gzip.GzipFile(fileobj=buf)
                        data = response(d.read())
                        # print 'DATA=', data
                data.newurl = f.geturl()
                # Alternative (disabled) implementation of the same workaround:
                # request = urllib2.Request(url)
                # if url.endswith('/'):
                #     request.add_header('Accept-encoding', 'gzip')
                #     res = urllib2.urlopen(request)
                #     if res.info().get('Content-Encoding') == 'gzip':
                #         buf = StringIO(res.read())
                #         f = gzip.GzipFile(fileobj=buf)
                #         data = f.read()
                # print 'DATA=', data
        except urllib2.URLError as err:
            if hasattr(err, 'code') and responses.has_key(err.code):
                raise FetchError, responses[err.code]
            if getattr(err, 'reason', [0])[0] == 104 or \
                getattr(getattr(err, 'args', [None])[0], 'errno', None) in (-2,
                        -3):  # Connection reset by peer or Name or service not known
                self.log.debug('Temporary error, retrying in 1 second')
                time.sleep(1)
                with closing(open_func(url, timeout=self.timeout)) as f:
                    data = response(f.read()+f.read())
                    if url.endswith('/'):
                        if '<html' not in data:
                            print 'INVALID_HTML'
                            buf = StringIO(data)  # buf = BytesIO(data)
                            d = gzip.GzipFile(fileobj=buf)
                            data = response(d.read())
                            # print 'DATA=', data
                    data.newurl = f.geturl()
            else:
                raise err
        finally:
            self.last_fetch_at = time.time()
        return data
    def start_fetch(self, url):
        print 'START_FETCH!'
        soup = BeautifulSoup(u'<a href="'+url+'" />')
        self.log.debug('Downloading')
        res = self.process_links(soup, url, 0, into_dir='')
        self.log.debug(url, 'saved to', res)
        return res

    def is_link_ok(self, url):
        for i in self.__class__.LINK_FILTER:
            if i.search(url):
                return False
        return True

    def is_link_wanted(self, url, tag):
        try:
            return self._is_link_wanted(url, tag)
        except NotImplementedError:
            pass
        except:
            return False
        if self.filter_regexps:
            for f in self.filter_regexps:
                if f.search(url):
                    return False
        if self.match_regexps:
            for m in self.match_regexps:
                if m.search(url):
                    return True
            return False
        return True
    def process_stylesheets(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')):
            if tag.has_key('href'):
                iurl = tag['href']
                if not urlparse.urlsplit(iurl).scheme:
                    iurl = urlparse.urljoin(baseurl, iurl, False)
                with self.stylemap_lock:
                    if self.stylemap.has_key(iurl):
                        tag['href'] = self.stylemap[iurl]
                        continue
                try:
                    data = self.fetch_url(iurl)
                except Exception:
                    self.log.exception('Could not fetch stylesheet ', iurl)
                    continue
                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                with self.stylemap_lock:
                    self.stylemap[iurl] = stylepath
                with open(stylepath, 'wb') as x:
                    x.write(data)
                tag['href'] = stylepath
            else:
                for ns in tag.findAll(text=True):
                    src = str(ns)
                    m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                    if m:
                        iurl = m.group(1)
                        if not urlparse.urlsplit(iurl).scheme:
                            iurl = urlparse.urljoin(baseurl, iurl, False)
                        with self.stylemap_lock:
                            if self.stylemap.has_key(iurl):
                                ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                                continue
                        try:
                            data = self.fetch_url(iurl)
                        except Exception:
                            self.log.exception('Could not fetch stylesheet ', iurl)
                            continue
                        c += 1
                        stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                        with self.stylemap_lock:
                            self.stylemap[iurl] = stylepath
                        with open(stylepath, 'wb') as x:
                            x.write(data)
                        ns.replaceWith(src.replace(m.group(1), stylepath))
    def rescale_image(self, data):
        orig_w, orig_h, ifmt = identify_data(data)
        orig_data = data  # save it in case compression fails
        if self.scale_news_images is not None:
            wmax, hmax = self.scale_news_images
            scale, new_w, new_h = fit_image(orig_w, orig_h, wmax, hmax)
            if scale:
                data = thumbnail(data, new_w, new_h, compression_quality=95)[-1]
                orig_w = new_w
                orig_h = new_h
        if self.compress_news_images_max_size is None:
            if self.compress_news_images_auto_size is None:  # not compressing
                return data
            else:
                maxsizeb = (orig_w * orig_h)/self.compress_news_images_auto_size
        else:
            maxsizeb = self.compress_news_images_max_size * 1024
        scaled_data = data  # save it in case compression fails
        if len(scaled_data) <= maxsizeb:  # no compression required
            return scaled_data

        img = Image()
        quality = 95
        img.load(data)
        while len(data) >= maxsizeb and quality >= 5:
            quality -= 5
            img.set_compression_quality(quality)
            data = img.export('jpg')

        if len(data) >= len(scaled_data):  # compression failed
            return orig_data if len(orig_data) <= len(scaled_data) else scaled_data

        if len(data) >= len(orig_data):  # no improvement
            return orig_data

        return data
    def process_images(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
            iurl = tag['src']
            if iurl.startswith('data:image/'):
                try:
                    data = b64decode(iurl.partition(',')[-1])
                except:
                    self.log.exception('Failed to decode embedded image')
                    continue
            else:
                if callable(self.image_url_processor):
                    iurl = self.image_url_processor(baseurl, iurl)
                if not urlparse.urlsplit(iurl).scheme:
                    iurl = urlparse.urljoin(baseurl, iurl, False)
                with self.imagemap_lock:
                    if self.imagemap.has_key(iurl):
                        tag['src'] = self.imagemap[iurl]
                        continue
                try:
                    data = self.fetch_url(iurl)
                    if data == 'GIF89a\x01':
                        # Skip empty GIF files as PIL errors on them anyway
                        continue
                except Exception:
                    self.log.exception('Could not fetch image ', iurl)
                    continue
            c += 1
            fname = ascii_filename('img'+str(c))
            if isinstance(fname, unicode):
                fname = fname.encode('ascii', 'replace')
            itype = what(None, data)
            if itype is None and b'<svg' in data[:1024]:
                # SVG image
                imgpath = os.path.join(diskpath, fname+'.svg')
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            else:
                try:
                    if itype not in {'png', 'jpg', 'jpeg'}:
                        itype = 'png' if itype == 'gif' else 'jpg'
                        im = Image()
                        im.load(data)
                        data = im.export(itype)
                    if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                        try:
                            data = self.rescale_image(data)
                        except:
                            self.log.exception('failed to compress image '+iurl)
                            identify_data(data)
                    else:
                        identify_data(data)
                    # Moon+ apparently cannot handle .jpeg files
                    if itype == 'jpeg':
                        itype = 'jpg'
                    imgpath = os.path.join(diskpath, fname+'.'+itype)
                    with self.imagemap_lock:
                        self.imagemap[iurl] = imgpath
                    with open(imgpath, 'wb') as x:
                        x.write(data)
                    tag['src'] = imgpath
                except:
                    traceback.print_exc()
                    continue
    def absurl(self, baseurl, tag, key, filter=True):
        iurl = tag[key]
        parts = urlparse.urlsplit(iurl)
        if not parts.netloc and not parts.path and not parts.query:
            return None
        if not parts.scheme:
            iurl = urlparse.urljoin(baseurl, iurl, False)
        if not self.is_link_ok(iurl):
            self.log.debug('Skipping invalid link:', iurl)
            return None
        if filter and not self.is_link_wanted(iurl, tag):
            self.log.debug('Filtered link: '+iurl)
            return None
        return iurl

    def normurl(self, url):
        parts = list(urlparse.urlsplit(url))
        parts[4] = ''
        return urlparse.urlunsplit(parts)

    def localize_link(self, tag, key, path):
        parts = urlparse.urlsplit(tag[key])
        suffix = '#'+parts.fragment if parts.fragment else ''
        tag[key] = path+suffix

    def process_return_links(self, soup, baseurl):
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'a' and tag.has_key('href')):
            iurl = self.absurl(baseurl, tag, 'href')
            if not iurl:
                continue
            nurl = self.normurl(iurl)
            if self.filemap.has_key(nurl):
                self.localize_link(tag, 'href', self.filemap[nurl])
    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
        res = ''
        diskpath = os.path.join(self.current_dir, into_dir)
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            tags = list(soup.findAll('a', href=True))

            for c, tag in enumerate(tags):
                if self.show_progress:
                    print '.',
                    sys.stdout.flush()
                sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
                if self.filemap.has_key(nurl):
                    self.localize_link(tag, 'href', self.filemap[nurl])
                    continue
                if self.files > self.max_files:
                    return res
                linkdir = 'link'+str(c) if into_dir else ''
                linkdiskpath = os.path.join(diskpath, linkdir)
                if not os.path.exists(linkdiskpath):
                    os.mkdir(linkdiskpath)
                try:
                    self.current_dir = linkdiskpath
                    dsrc = self.fetch_url(iurl)
                    newbaseurl = dsrc.newurl
                    if len(dsrc) == 0 or \
                       len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                        raise ValueError('No content at URL %r'%iurl)
                    if callable(self.encoding):
                        dsrc = self.encoding(dsrc)
                    elif self.encoding is not None:
                        dsrc = dsrc.decode(self.encoding, 'replace')
                    else:
                        dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                    soup = self.get_soup(dsrc, url=iurl)

                    base = soup.find('base', href=True)
                    if base is not None:
                        newbaseurl = base['href']
                    self.log.debug('Processing images...')
                    self.process_images(soup, newbaseurl)
                    if self.download_stylesheets:
                        self.process_stylesheets(soup, newbaseurl)
                    _fname = basename(iurl)
                    if not isinstance(_fname, unicode):
                        _fname = _fname.decode('latin1', 'replace')
                    _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
                    _fname = ascii_filename(_fname)
                    _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                    res = os.path.join(linkdiskpath, _fname)
                    self.downloaded_paths.append(res)
                    self.filemap[nurl] = res
                    if recursion_level < self.max_recursions:
                        self.log.debug('Processing links...')
                        self.process_links(soup, newbaseurl, recursion_level+1)
                    else:
                        self.process_return_links(soup, newbaseurl)
                        self.log.debug('Recursion limit reached. Skipping links in', iurl)

                    if callable(self.postprocess_html_ext):
                        soup = self.postprocess_html_ext(soup,
                                c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                                self.job_info)

                        if c==0 and recursion_level == 0:
                            self.called_first = True

                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
                except Exception:
                    self.failed_links.append((iurl, traceback.format_exc()))
                    self.log.exception('Could not fetch link', iurl)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
        finally:
            self.current_dir = prev_dir
        if self.show_progress:
            print  # finish the line of progress dots
        return res
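
# The recipe proper: calibre instantiates the class below. keep_only_tags targets
# Habrahabr's post title, article body and comment markup, and _fetch_article()
# drives the RecursiveFetcher defined above.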

class BasicUserRecipe1399883053(AutomaticNewsRecipe):
    title = u'habr'
    oldest_article = 100
    max_articles_per_feed = 100
    auto_cleanup = False
    remove_javascript = True
    encoding = 'utf8'
    simultaneous_downloads = 1
    timeout = 300
    remove_tags = []
    keep_only_tags = [dict(name='h2', attrs={'class':'title'}),
                      dict(name='h1', attrs={'class':'title'}),
                      dict(name='span', attrs={'class':'post_title'}),
                      dict(name='div', attrs={'class':'content html_format'}),
                      dict(name='div', attrs={'id':'comments'}),
                      dict(name='div', attrs={'class':'comments_list '}),
                      dict(name='div', attrs={'class':'comments_list'}),
                      dict(name='div', attrs={'class':'message html_format '})
                      ]

    feeds = [(u'habr', u'http://habrahabr.ru/rss/hubs/')]

    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            for article in feed.articles:
                print 'article.title is: ', article.title
                print 'url: ', article.url
        return feeds

    def preprocess_html(self, soup):
        print 'PREPROCESS_HTML'
        # print(soup.get_text())
        # for t in soup.findAll('div'):
        #     print t.name
        #     print t
        # print (soup.prettify())
        return soup
    def get_article(self, candidates, best_candidate):
        print 'GET_ARTICLE'
        # NB: this calls itself, so it would recurse forever if it were ever invoked
        ret = self.get_article(self, candidates, best_candidate)
        return ret
    # def extract_readable_article(self, html, url):
    #     print 'EXTRACT_HTML'
    #     article = extract_readable_article(self, html, url)
    #     return article

    def preprocess_raw_html_(self, raw_html, url):
        print 'PREPROCESS_RAW_HTML'
        # print raw_html
        raw_html = self.preprocess_raw_html(raw_html, url)
        if self.auto_cleanup:
            try:
                raw_html = self.extract_readable_article(raw_html, url)
            except:
                self.log.exception('Auto_cleanup_of_URL: %r failed'%url)
        return raw_html
    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
        # from calibre.web.fetch.simple import RecursiveFetcher
        print 'FETCH_ARTICLE!'
        br = self.browser
        if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
            # We are using the default get_browser, which means no need to
            # clone
            br = BasicNewsRecipe.get_browser(self)
        else:
            br = self.clone_browser(self.browser)
        self.web2disk_options.browser = br
        i = 0
        for i in range(0, 1):  # 5-10
            i = i + 1
            fetcher = RecursiveFetcher(self.web2disk_options, self.log,
                    self.image_map, self.css_map,
                    (url, f, a, num_of_feeds))
            fetcher.browser = br
            fetcher.base_dir = dir_
            fetcher.current_dir = dir_
            fetcher.show_progress = False
            fetcher.image_url_processor = self.image_url_processor
            res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
            print 'FETCH_PATH:', path
            print 'RES:', res
            print 'FAIL:', failures
            # print 'dir_:', dir_, ' br:', br
            # valid = False
            # ctypes.windll.user32.MessageBoxA(0, "pause", "Your title", 1)
            # f = open(res)
            # lines = f.readlines()
            # f.close()
            # for line in lines:
            #     print 'LINE:', line
            #     if '<html>' in line:
            #         valid = True
            #         break
            #     else:
            #         continue
            # if valid:
            #     print 'HTML_VALID:', i
            #     ctypes.windll.user32.MessageBoxA(0, "pause", "Your title", 1)
            #     break
            # else:
            #     print 'HTML_INVALID:', i
            #     ctypes.windll.user32.MessageBoxA(0, "pause INVALID", "Your title", 1)
            #     time.sleep(10)  # delays for 5 seconds
            #     os.remove(res)
            #     continue

        if not res or not os.path.exists(res):
            msg = _('Could not fetch article.') + ' '
            if self.debug:
                msg += _('The debug traceback is available earlier in this log')
            else:
                msg += _('Run with -vv to see the reason')
            raise Exception(msg)

        return res, path, failures
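
    # The triple-quoted blocks below are earlier versions of _fetch_article(),
    # parse_feeds() and extract_readable_article(), disabled by wrapping them in
    # string literals and kept only for reference.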
    '''
    def _fetch_article(self, url, dird, f, a, numoffeeds):
        print 'FETCH_ARTICLE'
        res, path, failures = BasicNewsRecipe._fetch_article(self, url, dird, f, a, numoffeeds)
        print 'FETCH_PATH:', path
        print 'FETCH_RES:', res
        print 'FETCH_FAIL:', failures
        return res, path, failures
    '''

    '''
    def parse_feeds(self):
        from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
        from calibre import (browser, __appname__, iswindows, force_unicode, strftime, preferred_encoding, as_unicode)
        from contextlib import nested, closing
        print 'PARSE_FEEDS'
        # Create a list of articles from the list of feeds returned by :meth:`BasicNewsRecipe.get_feeds`.
        # Return a list of :class:`Feed` objects.
        feeds = self.get_feeds()
        parsed_feeds = []
        for obj in feeds:
            if isinstance(obj, basestring):
                title, url = None, obj
            else:
                title, url = obj
            if url.startswith('feed://'):
                url = 'http'+url[4:]
            self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
            try:
                with closing(self.browser.open(url)) as f:
                    parsed_feeds.append(feed_from_xml(f.read(),
                                        title=title,
                                        log=self.log,
                                        oldest_article=self.oldest_article,
                                        max_articles_per_feed=self.max_articles_per_feed,
                                        get_article_url=self.get_article_url))
                if (self.delay > 0):
                    time.sleep(self.delay)
            except Exception as err:
                feed = Feed()
                msg = 'Failed_feed: %s'%(title if title else url)
                feed.populate_from_preparsed_feed(msg, [])
                feed.description = as_unicode(err)
                parsed_feeds.append(feed)
                self.log.exception(msg)

        remove = [fl for fl in parsed_feeds if len(fl) == 0 and
                self.remove_empty_feeds]
        for f in remove:
            parsed_feeds.remove(f)

        for feed in parsed_feeds:
            for article in feed.articles[:]:
                print 'article.title is: ', article.title
                print 'url: ', article.url
                print article.summary

        return parsed_feeds
    '''
    '''
    def extract_readable_article(self, html, url):
        print 'EXTRACT_HTML'
        # print html
        # Extracts main article content from 'html', cleans up and returns as a
        # (article_html, extracted_title) tuple.
        # Based on the original readability algorithm by Arc90.
        from calibre.ebooks.readability import readability
        from lxml.html import (fragment_fromstring, tostring,
                document_fromstring)

        doc = readability.Document(html, self.log, url=url,
                keep_elements=self.auto_cleanup_keep)
        article_html = doc.summary()
        # print 'article_html =', article_html
        extracted_title = doc.title()
        print 'doc.title =', extracted_title

        try:
            frag = fragment_fromstring(article_html)
        except:
            doc = document_fromstring(article_html)
            frag = doc.xpath('//body')[-1]
        if frag.tag == 'html':
            root = frag
        elif frag.tag == 'body':
            root = document_fromstring(
                u'<html><head><title>%s</title></head></html>' %
                extracted_title)
            root.append(frag)
        else:
            root = document_fromstring(
                u'<html><head><title>%s</title></head><body/></html>' %
                extracted_title)
            root.xpath('//body')[0].append(frag)

        print 'frag.tag =', frag.tag

        body = root.xpath('//body')[0]
        has_title = False
        for x in body.iterdescendants():
            if x.text == extracted_title:
                has_title = True
        inline_titles = body.xpath('//h1|//h2')
        if not has_title and not inline_titles:
            heading = body.makeelement('h2')
            heading.text = extracted_title
            body.insert(0, heading)

        raw_html = tostring(root, encoding=unicode)

        return raw_html
    '''
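
# Usage sketch: a recipe like this can be test-built from the command line with
# calibre's ebook-convert tool, e.g.
#   ebook-convert habr.recipe habr.epub --test -vv
# ("habr.recipe" is just an assumed name for a locally saved copy of this file;
# -vv is the verbosity flag referred to by the 'Run with -vv to see the reason'
# message in _fetch_article() above).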