soh-i · July 7, 2015 09:03
diff --git a/fetch_paxdb.py b/fetch_paxdb.py
 import lxml.html
 import re
 import requests
 import time
 from ppi.utils import debug

 def _to_sp(sp):
    if sp == "sce":
        return "S. cerevisiae (yeast)"
    elif sp == "hsa":
        return "H. sapiens"

 def generate_dl_links_from_html(path, sp):
     """Collect download URLs for one species from a saved HTML page.

     Table rows are scanned in document order. A three-cell row opens a
     species section (species name in the first cell, link in the third);
     the two-cell rows that follow belong to the same section and carry
     their link in the second cell.

     Args:
         path: Path of the HTML file to parse.
         sp: Species code understood by ``_to_sp`` (``"sce"`` or ``"hsa"``).

     Returns:
         A list of URLs (``base_url`` + ``href``) in page order. Empty when
         the species code is unknown or no row matches.

     Raises:
         IndexError: If a row that should carry a species ``div`` or a
             download ``a`` element does not contain one.
     """
     # Read through a context manager so the file handle is always closed.
     with open(path, "r") as handle:
         dom = lxml.html.fromstring(handle.read())

     # Resolve the display name once; it does not change between rows.
     wanted_name = _to_sp(sp)
     base_url = "http://pax-db.org/"
     row_xpath = "/html/body/div[3]/div[2]/div/div/div/div[2]/div[1]/div/table/tbody/tr"

     in_wanted_section = False
     results = []
     for row in dom.body.xpath(row_xpath):
         cells = row.xpath("td")
         if len(cells) == 3:
             species_name = cells[0].xpath("div")[0].text
             # An unknown code maps to None; never let that match a header
             # whose <div> happens to have no text (.text is None there).
             in_wanted_section = (
                 wanted_name is not None and species_name == wanted_name
             )
             link_cell = cells[2]
         elif len(cells) == 2:
             link_cell = cells[1]
         else:
             continue

         if in_wanted_section:
             href = link_cell.xpath("a")[0].get("href")
             results.append("%s%s" % (base_url, href))
     return results

 def get_paxdb_data(urls, delay=2, timeout=30):
     """Download each URL and save the response body as ``<name>.tsv``.

     The output name is the last ``/``-separated segment of the URL with
     ``.tsv`` appended, written relative to the current working directory.
     Responses whose status code is not 200 are skipped without writing.

     Args:
         urls: Iterable of URLs to fetch.
         delay: Seconds to sleep after every request (default 2).
         timeout: Timeout in seconds handed to ``requests.get`` so a
             stalled server cannot block the run forever (default 30).
     """
     for url in urls:
         r = requests.get(url, timeout=timeout)
         if r.status_code == 200:
             filename = url.split("/")[-1]
             debug(filename)
             # Store the raw bytes: going through ``r.text`` depends on a
             # guessed response encoding and on the platform's default
             # output encoding, which can fail or alter the data.
             with open("%s.tsv" % (filename), "wb") as out:
                 out.write(r.content)
         time.sleep(delay)

        
 if __name__ == "__main__":
     # Script entry point: gather the "hsa" download links from the local
     # index.html and fetch every one of them.
     get_paxdb_data(generate_dl_links_from_html("index.html", "hsa"))
	import lxml.html
	import re
	import requests
	import time
	from ppi.utils import debug

	def _to_sp(sp):
	if sp == "sce":
	return "S. cerevisiae (yeast)"
	elif sp == "hsa":
	return "H. sapiens"

	def generate_dl_links_from_html(path, sp):
		"""Collect download URLs for one species from a saved HTML page.

		Table rows are scanned in document order. A three-cell row opens a
		species section (species name in the first cell, link in the third);
		the two-cell rows that follow belong to the same section and carry
		their link in the second cell.

		Args:
			path: Path of the HTML file to parse.
			sp: Species code understood by ``_to_sp`` (``"sce"`` or ``"hsa"``).

		Returns:
			A list of URLs (``base_url`` + ``href``) in page order. Empty
			when the species code is unknown or no row matches.
		"""
		# Read through a context manager so the file handle is always closed.
		with open(path, "r") as handle:
			dom = lxml.html.fromstring(handle.read())

		# Resolve the display name once; it does not change between rows.
		wanted_name = _to_sp(sp)
		base_url = "http://pax-db.org/"
		row_xpath = "/html/body/div[3]/div[2]/div/div/div/div[2]/div[1]/div/table/tbody/tr"

		in_wanted_section = False
		results = []
		for row in dom.body.xpath(row_xpath):
			cells = row.xpath("td")
			if len(cells) == 3:
				species_name = cells[0].xpath("div")[0].text
				# An unknown code maps to None; never let that match a
				# header whose <div> has no text (.text is None there).
				in_wanted_section = (
					wanted_name is not None and species_name == wanted_name
				)
				link_cell = cells[2]
			elif len(cells) == 2:
				link_cell = cells[1]
			else:
				continue

			if in_wanted_section:
				href = link_cell.xpath("a")[0].get("href")
				results.append("%s%s" % (base_url, href))
		return results

	def get_paxdb_data(urls, delay=2, timeout=30):
		"""Download each URL and save the response body as ``<name>.tsv``.

		The output name is the last ``/``-separated segment of the URL with
		``.tsv`` appended, written relative to the current working directory.
		Responses whose status code is not 200 are skipped without writing.

		Args:
			urls: Iterable of URLs to fetch.
			delay: Seconds to sleep after every request (default 2).
			timeout: Timeout in seconds handed to ``requests.get`` so a
				stalled server cannot block the run forever (default 30).
		"""
		for url in urls:
			r = requests.get(url, timeout=timeout)
			if r.status_code == 200:
				filename = url.split("/")[-1]
				debug(filename)
				# Store the raw bytes: going through ``r.text`` depends on
				# a guessed response encoding and on the platform's default
				# output encoding, which can fail or alter the data.
				with open("%s.tsv" % (filename), "wb") as out:
					out.write(r.content)
			time.sleep(delay)


	if __name__ == "__main__":
		# Script entry point: gather the "hsa" download links from the local
		# index.html and fetch every one of them.
		hsa_links = generate_dl_links_from_html("index.html", "hsa")
		get_paxdb_data(hsa_links)
No results found