Skip to content

Instantly share code, notes, and snippets.

@soh-i
Created July 7, 2015 09:03
Show Gist options
  • Select an option

  • Save soh-i/733931eba91c0d37e375 to your computer and use it in GitHub Desktop.

Select an option

Save soh-i/733931eba91c0d37e375 to your computer and use it in GitHub Desktop.
Fetch all data from http://pax-db.org
import lxml.html
import re
import requests
import time
from ppi.utils import debug
def _to_sp(sp):
if sp == "sce":
return "S. cerevisiae (yeast)"
elif sp == "hsa":
return "H. sapiens"
def generate_dl_links_from_html(path, sp):
    """Collect download URLs for one species from a saved HTML page.

    The table rows selected by the XPath below are walked in order.
    A row with three ``<td>`` cells starts a species section: the first
    cell's ``<div>`` text is the species name and the third cell holds a
    link.  A row with two ``<td>`` cells belongs to the most recent
    section and holds its link in the second cell.  Links are collected
    only while the current section's name equals ``_to_sp(sp)``.

    Args:
        path: Path of the HTML file to parse.
        sp: Short species code understood by ``_to_sp``.

    Returns:
        A list of URL strings, each the site base URL followed by the
        link's ``href``.  The list is empty when the species code is
        unknown or no matching section is found.
    """
    wanted = _to_sp(sp)
    if wanted is None:
        # An unknown code must not match anything.  Without this guard a
        # species cell whose <div> has no text (``.text`` is None) would
        # compare equal to the None returned by _to_sp.
        return []

    # Close the file deterministically instead of leaking the handle.
    with open(path, "r") as handle:
        dom = lxml.html.fromstring(handle.read())

    base_url = "http://pax-db.org/"
    row_xpath = (
        "/html/body/div[3]/div[2]/div/div/div/div[2]/div[1]/div/table/tbody/tr"
    )

    results = []
    in_section = False
    for row in dom.body.xpath(row_xpath):
        cells = row.xpath("td")
        if len(cells) == 3:
            # Section header row: decide whether this is the wanted species.
            name_divs = cells[0].xpath("div")
            in_section = bool(name_divs) and name_divs[0].text == wanted
            link_cell = cells[2]
        elif len(cells) == 2:
            # Continuation row of the current section.
            link_cell = cells[1]
        else:
            continue

        if not in_section:
            continue

        # Skip rows without a usable link rather than raising IndexError
        # or emitting a bogus ".../None" URL.
        anchors = link_cell.xpath("a")
        if not anchors:
            continue
        href = anchors[0].get("href")
        if href:
            results.append("%s%s" % (base_url, href))
    return results
def get_paxdb_data(urls, delay=2, timeout=60):
    """Download every URL and save each successful response to disk.

    For each URL a GET request is issued.  When the response status is
    200 the body is written to ``<last URL path segment>.tsv`` in the
    current working directory; any other status is reported through
    ``debug`` and skipped.

    Args:
        urls: Iterable of URL strings to fetch.
        delay: Seconds to sleep after each request, to throttle the load
            on the server.
        timeout: Seconds to wait for the server before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        None.
    """
    for url in urls:
        r = requests.get(url, timeout=timeout)
        if r.status_code == 200:
            filename = url.split("/")[-1]
            debug(filename)
            # Write the raw bytes: decoding to text and re-encoding with
            # the platform default codec can fail or alter the payload.
            with open("%s.tsv" % (filename), "wb") as out:
                out.write(r.content)
        else:
            debug("skipped %s (HTTP %s)" % (url, r.status_code))
        # NOTE(review): the original indentation was lost; the pause is
        # applied once per request, whether or not it succeeded.
        time.sleep(delay)
if __name__ == "__main__":
    # Parse the locally saved downloads page ("index.html") and fetch
    # every dataset listed for the selected species code ("hsa").
    species_links = generate_dl_links_from_html("index.html", "hsa")
    get_paxdb_data(species_links)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment