|  | #! /usr/bin/python | 
        
          |  | # coding: utf-8 | 
        
          |  |  | 
        
          |  | # In[9]: | 
        
          |  |  | 
        
# stdlib
import datetime
import hashlib  # modern replacement for the deprecated md5 module
import logging
import md5
import os
import re
import sys

# third party
import requests
import lxml
from lxml import etree
from lxml import html
        
          |  |  | 
        
          |  | if "debug" in sys.argv : | 
        
          |  | logging.basicConfig(level=logging.DEBUG,stream=sys.stderr) | 
        
          |  | else : | 
        
          |  | logging.basicConfig(filename="/dev/null") | 
        
          |  |  | 
        
          |  |  | 
        
          |  | logger=logging.getLogger("") | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def download_file(url,directory="./",callback=False) : | 
        
          |  | """ puts file into directory. processes HTML with callback if present. | 
        
          |  | chooses unique filename. the filename is returned | 
        
          |  | will not work for HTML>1024kb """ | 
        
          |  | local_filename = url.split('/')[-1] | 
        
          |  | if not os.path.exists(directory) : | 
        
          |  | os.makedirs(directory) | 
        
          |  | dfile=os.path.join(directory,local_filename) | 
        
          |  | v=0 | 
        
          |  | while os.path.exists(dfile) : | 
        
          |  | v=v+1 | 
        
          |  | ds=os.path.splitext(local_filename) | 
        
          |  | dfile=os.path.join(directory, "%s.%s%s" % (ds[0],v,ds[1])) | 
        
          |  | logger.debug("Getting %s -> %s" % (url,dfile)) | 
        
          |  | r = requests.get(url,stream=True,verify=False) # verify=False hat ngertz 'rausgefunden! | 
        
          |  | with open(dfile, 'wb') as f : | 
        
          |  | for chunk in r.iter_content(chunk_size=1024 * 1024): | 
        
          |  | if chunk: # filter out keep-alive new chunks | 
        
          |  | if r.headers["content-type"].find("html")>-1 and callable(callback) : | 
        
          |  | tree=html.fromstring(chunk) | 
        
          |  | chunk=etree.tostring(callback(tree,directory=directory),method="html") | 
        
          |  | f.write(chunk) | 
        
          |  | f.close() | 
        
          |  | return dfile | 
        
          |  |  | 
        
          |  |  | 
        
          |  |  | 
        
          |  | # In[10]: | 
        
          |  |  | 
        
          |  | def detrack(url) : | 
        
          |  | r=requests.get(url,allow_redirects=False,verify=False) # verify siehe oben | 
        
          |  | if r.status_code>299 and r.status_code<400 : | 
        
          |  | return r.headers["location"] | 
        
          |  | else : | 
        
          |  | return url | 
        
          |  |  | 
        
          |  | def makedate(match) : | 
        
          |  | g=match.groups() | 
        
          |  | if datetime.datetime.now().hour<9 : | 
        
          |  | td=(datetime.datetime.strptime(g[1],"%Y-%m-%d")-datetime.timedelta(days=1)).strftime("%Y-%m-%d") | 
        
          |  | else : | 
        
          |  | td=g[1] | 
        
          |  | return g[0]+td | 
        
          |  |  | 
        
          |  |  | 
        
# XPaths (relative to the document root) of boilerplate table rows that
# process() removes from the newsletter before it is stored.
empty=["body/table/tr/td/table/tr/td/table/tbody/tr[1]", # display hint
       "body/table/tr/td/table/tr/td/table/tbody/tr[6]", # recipient's e-mail address
       "body/table/tr/td/table/tr/td/table/tbody/tr[5]", # "Newsletter vom xxxx-xx-xx"
]

# (xpath, search, replace) triples applied to element text by process().
# `search` may be a plain string or a compiled regex object (then
# `replace` may be a substitution callable such as makedate).
replace=[("head/title","Versicherungsmonitor:","Newsletter:"),
# ("body/table/tr/td/table/tr/td/table/tbody/tr[5]",re.compile(r"(Newsletter vom )(\d\d\d\d-\d\d-\d\d)"),makedate),
]
        
          |  |  | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def process(tree,directory="./") : | 
        
          |  | for e in empty : | 
        
          |  | ee=tree.findall(e) | 
        
          |  | c=0 | 
        
          |  | tt=[] | 
        
          |  | if ee : | 
        
          |  | for rre in ee : | 
        
          |  | tt.append(etree.tostring(rre)) | 
        
          |  | rre.getparent().remove(rre) | 
        
          |  | c=c+1 | 
        
          |  | logger.debug("%s - %s removed: %s " % (e,c,",".join([repr(a) for a in tt]))) | 
        
          |  | for (xp,sr,rp) in replace : | 
        
          |  | ee=tree.findall(xp) | 
        
          |  | if ee : | 
        
          |  | for reg in ee : | 
        
          |  | if hasattr(sr,"sub") : | 
        
          |  | t=sr.sub(rp,repr(reg.text_content())) | 
        
          |  | else : | 
        
          |  | t=reg.text_content().replace(sr,rp) | 
        
          |  | if (t!=reg.text_content()) : | 
        
          |  | logging.debug("Replaced %s -> %s in %s" % (reg.text_content(),t,xp)) | 
        
          |  | reg.text=t | 
        
          |  | for a in tree.cssselect("img") : | 
        
          |  | fn=re.sub("#.*$","",a.attrib["src"]) | 
        
          |  | filename=os.path.split(download_file(fn,directory=directory))[1] | 
        
          |  | a.attrib["data-original-src"]=a.attrib["src"] | 
        
          |  | a.attrib["src"]=filename | 
        
          |  | for a in tree.cssselect("a[href]") : | 
        
          |  | ou=a.attrib["href"] | 
        
          |  | if ou.find("http")==0 : | 
        
          |  | d=detrack(a.attrib["href"]) | 
        
          |  | if d != a.attrib["href"] : | 
        
          |  | logger.debug("Detracked %s -> %s" % (a.attrib["href"],d)) | 
        
          |  | a.attrib["data-orgiginal-href"]=a.attrib["href"] | 
        
          |  | try : | 
        
          |  | a.attrib["href"]=d | 
        
          |  | except Exception, e : | 
        
          |  | a.attrib["href"]="Error %s" % repr(e) | 
        
          |  | a.attrib["target"]="_blank" | 
        
          |  | return tree | 
        
          |  |  | 
        
          |  |  | 
        
          |  | def run(url,directory) : | 
        
          |  | v_directory=directory % { "hash" : md5.md5(url).hexdigest() } | 
        
          |  | if v_directory != directory : | 
        
          |  | if os.path.exists(v_directory) : | 
        
          |  | print "directory %s exists. %s not downloaded" % (v_directory,url) | 
        
          |  | sys.exit(255) | 
        
          |  | else : | 
        
          |  | directory=v_directory | 
        
          |  | print download_file(url | 
        
          |  | ,directory=directory | 
        
          |  | ,callback=process) | 
        
          |  |  | 
        
          |  |  | 
        
          |  | if __name__=="__main__" : | 
        
          |  | if len(sys.argv)>1 : | 
        
          |  | run(sys.argv[1],sys.argv[2]) | 
        
          |  | logging.debug("%s copied." % sys.argv[1]) | 
        
          |  | else : | 
        
          |  | print """ %s URL DIRECTORY [debug] | 
        
          |  |  | 
        
          |  | HTML Sanitizing for Email Newsletters | 
        
          |  |  | 
        
          |  | --- copies URL and all referenced <img> file into one directory, changes <img src> attribute | 
        
          |  | --- checks all <a href> to see if they produce a 301 and change the href attribute accordingly | 
        
          |  | --- old attribute values are preserved in data-original-* attributes | 
        
          |  | --- certain unnecesary HTML elements, whose XPATHs are listed in the array called empty, are removed | 
        
          |  | --- Text is edited according to the replace array, which lists triples of (xpath,search,replace) | 
        
          |  | --- | 
        
          |  | --- If DIRECTORY contains the replacement string %(hash)s, this part of the DIRECTORY will be | 
        
          |  | --- replaced by a MD5 hash of the URL, and the program will exit if this DIRECTORY already exists. | 
        
          |  | --- | 
        
          |  |  | 
        
          |  | debug - if present - will lead to copious output. | 
        
          |  |  | 
        
          |  | ToDo | 
        
          |  |  | 
        
          |  | Copy CSS as well? Look for images in CSS? | 
        
          |  | Allow "overwriting" of changed files when downloading the same URL twice | 
        
          |  |  | 
        
          |  | """ % sys.argv[0] | 
        
          |  |  |