|  | #! /usr/bin/python | 
        
          |  | # coding: utf-8 | 
        
          |  |  | 
        
          |  | # In[9]: | 
        
          |  |  | 
        
# stdlib
import datetime
import hashlib  # modern replacement for the deprecated md5 module
import logging
import md5
import os
import re
import sys

# third party
import requests
import lxml
from lxml import etree
from lxml import html
        
          |  |  | 
        
          |  | if "debug" in sys.argv : | 
        
          |  | logging.basicConfig(level=logging.DEBUG,stream=sys.stderr) | 
        
          |  | else : | 
        
          |  | logging.basicConfig(filename="/dev/null") | 
        
          |  |  | 
        
          |  |  | 
        
          |  | logger=logging.getLogger("") | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def download_file(url,directory="./",callback=False) : | 
        
          |  | """ puts file into directory. processes HTML with callback if present. | 
        
          |  | chooses unique filename. the filename is returned | 
        
          |  | will not work for HTML>1024kb """ | 
        
          |  | local_filename = url.split('/')[-1] | 
        
          |  | if not os.path.exists(directory) : | 
        
          |  | os.makedirs(directory) | 
        
          |  | dfile=os.path.join(directory,local_filename) | 
        
          |  | v=0 | 
        
          |  | while os.path.exists(dfile) : | 
        
          |  | v=v+1 | 
        
          |  | ds=os.path.splitext(local_filename) | 
        
          |  | dfile=os.path.join(directory, "%s.%s%s" % (ds[0],v,ds[1])) | 
        
          |  | logger.debug("Getting %s -> %s" % (url,dfile)) | 
        
          |  | r = requests.get(url,stream=True,verify=False) # verify=False hat ngertz 'rausgefunden! | 
        
          |  | with open(dfile, 'wb') as f : | 
        
          |  | for chunk in r.iter_content(chunk_size=1024 * 1024): | 
        
          |  | if chunk: # filter out keep-alive new chunks | 
        
          |  | if r.headers["content-type"].find("html")>-1 and callable(callback) : | 
        
          |  | tree=html.fromstring(chunk) | 
        
          |  | chunk=etree.tostring(callback(tree,directory=directory),method="html") | 
        
          |  | f.write(chunk) | 
        
          |  | f.close() | 
        
          |  | return dfile | 
        
          |  |  | 
        
          |  |  | 
        
          |  |  | 
        
          |  | # In[10]: | 
        
          |  |  | 
        
          |  | def detrack(url) : | 
        
          |  | r=requests.get(url,allow_redirects=False,verify=False) # verify siehe oben | 
        
          |  | if r.status_code>299 and r.status_code<400 : | 
        
          |  | return r.headers["location"] | 
        
          |  | else : | 
        
          |  | return url | 
        
          |  |  | 
        
          |  | def makedate(match) : | 
        
          |  | g=match.groups() | 
        
          |  | if datetime.datetime.now().hour<9 : | 
        
          |  | td=(datetime.datetime.strptime(g[1],"%Y-%m-%d")-datetime.timedelta(days=1)).strftime("%Y-%m-%d") | 
        
          |  | else : | 
        
          |  | td=g[1] | 
        
          |  | return g[0]+td | 
        
          |  |  | 
        
          |  |  | 
        
# XPaths (relative to the document root) of boilerplate table rows that
# process() removes from the newsletter before it is stored.
empty=["body/table/tr/td/table/tr/td/table/tbody/tr[1]", # display hint
       "body/table/tr/td/table/tr/td/table/tbody/tr[6]", # recipient's e-mail address
       "body/table/tr/td/table/tr/td/table/tbody/tr[5]", # "Newsletter vom xxxx-xx-xx"
]

# (xpath, search, replace) triples applied to element text by process().
# `search` may be a plain string or a compiled regex object (then
# `replace` may be a substitution callable such as makedate).
replace=[("head/title","Versicherungsmonitor:","Newsletter:"),
# ("body/table/tr/td/table/tr/td/table/tbody/tr[5]",re.compile(r"(Newsletter vom )(\d\d\d\d-\d\d-\d\d)"),makedate),
]
        
          |  |  | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def process(tree,directory="./") : | 
        
          |  | for e in empty : | 
        
          |  | ee=tree.findall(e) | 
        
          |  | c=0 | 
        
          |  | tt=[] | 
        
          |  | if ee : | 
        
          |  | for rre in ee : | 
        
          |  | tt.append(etree.tostring(rre)) | 
        
          |  | rre.getparent().remove(rre) | 
        
          |  | c=c+1 | 
        
          |  | logger.debug("%s - %s removed: %s " % (e,c,",".join([repr(a) for a in tt]))) | 
        
          |  | for (xp,sr,rp) in replace : | 
        
          |  | ee=tree.findall(xp) | 
        
          |  | if ee : | 
        
          |  | for reg in ee : | 
        
          |  | if hasattr(sr,"sub") : | 
        
          |  | t=sr.sub(rp,repr(reg.text_content())) | 
        
          |  | else : | 
        
          |  | t=reg.text_content().replace(sr,rp) | 
        
          |  | if (t!=reg.text_content()) : | 
        
          |  | logging.debug("Replaced %s -> %s in %s" % (reg.text_content(),t,xp)) | 
        
          |  | reg.text=t | 
        
          |  | for a in tree.cssselect("img") : | 
        
          |  | fn=re.sub("#.*$","",a.attrib["src"]) | 
        
          |  | filename=os.path.split(download_file(fn,directory=directory))[1] | 
        
          |  | a.attrib["data-original-src"]=a.attrib["src"] | 
        
          |  | a.attrib["src"]=filename | 
        
          |  | for a in tree.cssselect("a[href]") : | 
        
          |  | ou=a.attrib["href"] | 
        
          |  | if ou.find("http")==0 : | 
        
          |  | d=detrack(a.attrib["href"]) | 
        
          |  | if d != a.attrib["href"] : | 
        
          |  | logger.debug("Detracked %s -> %s" % (a.attrib["href"],d)) | 
        
          |  | a.attrib["data-orgiginal-href"]=a.attrib["href"] | 
        
          |  | try : | 
        
          |  | a.attrib["href"]=d | 
        
          |  | except Exception, e : | 
        
          |  | a.attrib["href"]="Error %s" % repr(e) | 
        
          |  | a.attrib["target"]="_blank" | 
        
          |  | return tree | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def run(url,directory) : | 
        
          |  | v_directory=directory % { "hash" : md5.md5(url).hexdigest() } | 
        
          |  | if v_directory != directory : | 
        
          |  | if os.path.exists(v_directory) : | 
        
          |  | print "directory %s exists. %s not downloaded" % (v_directory,url) | 
        
          |  | sys.exit(255) | 
        
          |  | else : | 
        
          |  | directory=v_directory | 
        
          |  | print download_file(url | 
        
          |  | ,directory=directory | 
        
          |  | ,callback=process) | 
        
          |  |  | 
        
          |  |  | 
        
          |  | if __name__=="__main__" : | 
        
          |  | if len(sys.argv)>1 : | 
        
          |  | run(sys.argv[1],sys.argv[2]) | 
        
          |  | logging.debug("%s copied." % sys.argv[1]) | 
        
          |  | else : | 
        
          |  | print """ %s URL DIRECTORY [debug] | 
        
          |  |  | 
        
          |  | HTML Sanitizing for Email Newsletters | 
        
          |  |  | 
        
          |  | --- copies URL and all referenced <img> file into one directory, changes <img src> attribute | 
        
          |  | --- checks all <a href> to see if they produce a 301 and change the href attribute accordingly | 
        
          |  | --- old attribute values are preserved in data-original-* attributes | 
        
          |  | --- certain unnecesary HTML elements, whose XPATHs are listed in the array called empty, are removed | 
        
          |  | --- Text is edited according to the replace array, which lists triples of (xpath,search,replace) | 
        
          |  | --- | 
        
          |  | --- If DIRECTORY contains the replacement string %(hash)s, this part of the DIRECTORY will be | 
        
          |  | --- replaced by a MD5 hash of the URL, and the program will exit if this DIRECTORY already exists. | 
        
          |  | --- | 
        
          |  |  | 
        
          |  | debug - if present - will lead to copious output. | 
        
          |  |  | 
        
          |  | ToDo | 
        
          |  |  | 
        
          |  | Copy CSS as well? Look for images in CSS? | 
        
          |  | Allow "overwriting" of changed files when downloading the same URL twice | 
        
          |  |  | 
        
          |  | """ % sys.argv[0] | 
        
          |  |  |