Skip to content

Instantly share code, notes, and snippets.

@explodecomputer
Last active August 29, 2015 14:18
Show Gist options
  • Save explodecomputer/9a8d3fc9583d300b4244 to your computer and use it in GitHub Desktop.
Save explodecomputer/9a8d3fc9583d300b4244 to your computer and use it in GitHub Desktop.
Scrape PD gene website for GWAS associations for a given list of SNPs

Scrape PD gene website for GWAS associations for a given list of SNPs

Requirements

This requires Python v2.7 plus the following plugin: BeautifulSoup 4 (bs4).

Usage

It reads in a file that has one column specifying SNP names and any other columns you want (they won't be used by the script but will be printed in the output along with the results). You specify which column is the SNP column, and optionally also specify the delimiter used to separate fields and whether or not there is a header row. The default output file is <input-filename>.pd. For example, using the test.csv file you would run like this:

python2.7 scrape_pdgene.py --snp-list test.csv --column 3 --header --delim ',' --out results.csv

The --snp-list flag specifies the input file, the --column flag specifies which column is the SNP IDs, the --header flag specifies that the first line is a header (omit this flag for the default of no header line), the --delim flag specifies the character that separates the fields in the file, and the --out flag specifies where the results will be saved.

For more info run:

python2.7 scrape_pdgene.py --help

More information on the PD Gene database can be found at http://www.pdgene.org.

#!/usr/bin/python
import csv
import re
import urllib2
from bs4 import BeautifulSoup
import argparse
def main():
args = argument_parser()
# Read in the file
dat = get_snplist(args.snpfile[0], args.header, args.delim)
header = dat[1]
snplist = dat[0]
# Extract the info from pd gene
print ""
print "Scraping www.pdgene.org for association results for "+str(len(snplist))+" SNPs from "+args.snpfile[0]
print "Will write results to "+args.outfile
print ""
snplist = [extract_tabledata(snprow, args.column) for snprow in snplist]
if args.header:
dat = [header] + snplist
else:
dat = snplist
# Write out to file
write_snplist(dat, args.outfile)
def argument_parser():
    """Define, parse and normalise the command line options.

    Returns the argparse namespace with outfile/column/delim already
    post-processed into their usable forms."""
    ap = argparse.ArgumentParser(description="Scrape the PD gene online database.")
    ap.add_argument("--snp-list", dest='snpfile', nargs=1, help="File with at least one column with SNP rs IDs", required=True)
    ap.add_argument("--column", dest='column', type=int, nargs=1, help="Which column is the SNP name", required=True)
    ap.add_argument("--header", dest="header", action='store_const', const=True, default=False, help="Use this flag if there is a header in the snp-list file")
    ap.add_argument("--delim", dest="delim", nargs=1, default=',', help="Character used to separate fields in snp-list-file. Defaults to ',' but will take any single character. If you want to use tab separator then specify using '\\t'")
    ap.add_argument("--out", dest='outfile', nargs=1, help="Output filename. Defaults to <snp-list-file>.pd")

    opts = ap.parse_args()

    # Default the output name to <input>.pd when --out was not given
    if opts.outfile:
        opts.outfile = opts.outfile[0]
    else:
        opts.outfile = opts.snpfile[0] + ".pd"

    # Convert the 1-based user column to a 0-based index
    opts.column = opts.column[0] - 1

    # Interpret escapes such as '\\t' typed on the command line (Python 2 only)
    opts.delim = opts.delim[0].decode('string-escape')

    return opts
def get_snplist(filename, header, delim):
    """Read the SNP input file.

    filename -- path to the delimited input file
    header   -- True if the first line is a header row
    delim    -- single-character field delimiter

    Returns [data, headers] where data is a list of rows (each a list of
    strings) and headers is the header row extended with the three PD Gene
    result columns, or [] when the file has no header.
    """
    # FIX: open with 'r' instead of 'rU' -- the 'U' mode flag was deprecated
    # and removed in Python 3.11; universal-newline handling is the default.
    with open(filename, 'r') as csvfile:
        gwas = csv.reader(csvfile, delimiter=delim, quotechar='"')
        if header:
            # FIX: guard against a completely empty file, where next() would
            # return the None default and `None + [...]` would raise TypeError.
            first = next(gwas, None)
            headers = (first if first is not None else []) + ["PD.Alleles", "PD.Effect", "PD.Pval"]
        else:
            headers = []
        data = [row for row in gwas]
    return [data, headers]
def write_snplist(dat, filename):
print "\nWriting results to "+filename
with open(filename, "wb") as f:
writer = csv.writer(f)
writer.writerows(dat)
def bs_preprocess(html):
    """Remove distracting whitespace and newline characters from raw HTML
    so the parsed tree has clean text nodes."""
    edge_ws = re.compile(r'(^\s+)|(\s+$)', re.MULTILINE)
    text = edge_ws.sub('', html)        # trim each line's leading/trailing whitespace
    text = text.replace('\n', ' ')      # collapse newlines into spaces
    text = re.sub(r'\s+<', '<', text)   # drop whitespace before opening tags
    return re.sub(r'>\s+', '>', text)   # drop whitespace after closing tags
def get_headings(snp):
    """Fetch the pdgene.org page for `snp` and return the column headings
    of its results table (the <th> texts of the first row)."""
    url = 'http://www.pdgene.org/view?poly=' + snp
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(bs_preprocess(html))
    table = soup.find("table", attrs={"class": "list"})
    first_row = table.find("tr")
    return [th.get_text() for th in first_row.find_all("th")]
def extract_tabledata(snprow, column):
print "Processing SNP "+snprow[column]
url = 'http://www.pdgene.org/view?poly='+snprow[column]
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(bs_preprocess(page))
table = soup.find("table", attrs={"class":"list"})
if table != None:
for row in table.find_all("tr")[1:]:
dataset = [td.get_text() for td in row.find_all("td")]
dataset = [dataset[3], dataset[9], dataset[11]]
else:
dataset = ["NA", "NA", "NA"]
snprow = snprow+dataset
return snprow
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Disease/Trait Strongest SNP-Risk Allele SNPs OR or beta Reported Gene(s)
Cholesterol: total rs1883025-T rs1883025 .07 ABCA1
Cholesterol: total rs2287623-G rs2287623 .03 ABCB11
Cholesterol: total rs4299376-G rs4299376 .08 ABCG5: ABCG8
Cholesterol: total rs9411489-T rs9411489 .07 ABO
Cholesterol: total rs2131925-G rs2131925 .08 ANGPTL3
Cholesterol: total rs964184-C rs964184 .12 APOA1
Cholesterol: total rs1367117-A rs1367117 .10 APOB
Cholesterol: total rs4420638-G rs4420638 .20 APOE
Cholesterol: total rs1077514-C rs1077514 .03 ASAP3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment