|
#!/usr/bin/python |
|
|
|
import csv |
|
import re |
|
import urllib2 |
|
from bs4 import BeautifulSoup |
|
import argparse |
|
|
|
|
|
def main():
    """Entry point: read a SNP list, scrape pdgene.org for each SNP, write results."""
    opts = argument_parser()

    # Load the input file: rows of data plus the (possibly empty) header row.
    rows, header = get_snplist(opts.snpfile[0], opts.header, opts.delim)

    # Announce what is about to happen.
    print("")
    print("Scraping www.pdgene.org for association results for "+str(len(rows))+" SNPs from "+opts.snpfile[0])
    print("Will write results to "+opts.outfile)
    print("")

    # Append the three scraped PD-gene columns to every row.
    augmented = [extract_tabledata(row, opts.column) for row in rows]

    # Re-attach the extended header when the input file had one.
    output = [header] + augmented if opts.header else augmented

    # Write out to file.
    write_snplist(output, opts.outfile)
|
|
|
|
|
def argument_parser():
    """Parse and normalise command-line options.

    Returns the argparse namespace with:
      - outfile resolved to a plain string (defaults to <snp-list-file>.pd),
      - column converted from the user's 1-based number to a 0-based index,
      - delim unescaped to the literal separator character (e.g. '\\t' -> tab).
    """
    parser = argparse.ArgumentParser(description="Scrape the PD gene online database.")
    parser.add_argument("--snp-list", dest='snpfile', nargs=1, help="File with at least one column with SNP rs IDs", required=True)
    parser.add_argument("--column", dest='column', type=int, nargs=1, help="Which column is the SNP name", required=True)
    parser.add_argument("--header", dest="header", action='store_const', const=True, default=False, help="Use this flag if there is a header in the snp-list file")
    # The default is a one-element LIST so that args.delim[0] below behaves the
    # same whether or not the flag was given.  Previously the default was the
    # bare string ',' and only worked because ','[0] happens to equal ','.
    parser.add_argument("--delim", dest="delim", nargs=1, default=[','], help="Character used to separate fields in snp-list-file. Defaults to ',' but will take any single character. If you want to use tab separator then specify using '\\t'")
    parser.add_argument("--out", dest='outfile', nargs=1, help="Output filename. Defaults to <snp-list-file>.pd")

    # Read in arguments
    args = parser.parse_args()
    args.outfile = args.outfile[0] if args.outfile else args.snpfile[0]+".pd"
    # Convert the 1-based column number supplied by the user to a list index.
    args.column = args.column[0] - 1
    # Turn escape sequences such as '\\t' into the literal character
    # (Python 2 only codec; this script targets Python 2 -- see urllib2 use).
    args.delim = args.delim[0].decode('string-escape')
    return args
|
|
|
|
|
def get_snplist(filename, header, delim):
    """Read a delimited SNP file.

    Args:
        filename: path to the delimited text file.
        header: if True, consume the first row as a header and extend it
            with the three PD-gene result column names.
        delim: single-character field separator.

    Returns:
        [data, headers] where data is a list of rows (each a list of
        strings) and headers is the extended header row, or [] when
        header is False or the file is empty.
    """
    # Plain 'r' instead of the original 'rU': the 'U' flag has long been
    # deprecated (and was removed in Python 3.11); for \n and \r\n files the
    # csv reader behaves identically.
    with open(filename, 'r') as csvfile:
        gwas = csv.reader(csvfile, delimiter=delim, quotechar='"')
        headers = []
        if header:
            first = next(gwas, None)
            # Guard against an empty file: next() returns None there, and the
            # original unconditional `None + [...]` raised TypeError.
            if first is not None:
                headers = first + ["PD.Alleles", "PD.Effect", "PD.Pval"]
        data = list(gwas)
    return [data, headers]
|
|
|
|
|
def write_snplist(dat, filename):
    """Serialise `dat` (a list of rows) to `filename` as comma-separated values."""
    print("\nWriting results to "+filename)
    # 'wb' is the csv-module file-mode convention on Python 2.
    with open(filename, "wb") as handle:
        csv.writer(handle).writerows(dat)
|
|
|
|
|
def bs_preprocess(html):
    """Normalise whitespace in an HTML string before parsing.

    Strips leading/trailing whitespace on every line, collapses newlines to
    spaces, and removes whitespace adjacent to tag brackets so that text
    extracted from the parsed tree is not padded with layout blanks.
    """
    # Raw strings throughout: '\s' inside a plain literal is an invalid
    # escape sequence (a SyntaxWarning on modern Pythons) and only works
    # by accident.
    pat = re.compile(r'(^[\s]+)|([\s]+$)', re.MULTILINE)
    html = pat.sub('', html)             # remove leading and trailing whitespaces
    html = re.sub(r'\n', ' ', html)      # convert newlines to spaces
    html = re.sub(r'[\s]+<', '<', html)  # remove whitespaces before opening tags
    html = re.sub(r'>[\s]+', '>', html)  # remove whitespaces after closing tags
    return html
|
|
|
|
|
def get_headings(snp):
    """Fetch the pdgene.org page for `snp` and return the column headings
    (the <th> texts of the first row) of its association-results table."""
    page = urllib2.urlopen('http://www.pdgene.org/view?poly='+snp).read()
    parsed = BeautifulSoup(bs_preprocess(page))
    results_table = parsed.find("table", attrs={"class":"list"})
    first_row = results_table.find("tr")
    return [cell.get_text() for cell in first_row.find_all("th")]
|
|
|
|
|
def extract_tabledata(snprow, column):
    """Scrape pdgene.org for one SNP and append three result fields to its row.

    Args:
        snprow: one row from the input file (list of strings).
        column: 0-based index of the rs ID within snprow.

    Returns:
        snprow extended with three scraped values, or with three "NA"
        placeholders when no results table (or no data row) is found.
    """
    print("Processing SNP "+snprow[column])
    url = 'http://www.pdgene.org/view?poly='+snprow[column]
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(bs_preprocess(page))
    table = soup.find("table", attrs={"class":"list"})
    # Default to NA up front: a table that exists but has only a header row
    # left `dataset` unbound in the original and raised NameError below.
    dataset = ["NA", "NA", "NA"]
    if table is not None:
        # NOTE(review): like the original loop, this keeps only the LAST
        # data row of the table -- confirm that is the intended row.
        for row in table.find_all("tr")[1:]:
            cells = [td.get_text() for td in row.find_all("td")]
            # Cells 3/9/11 presumably hold alleles, effect and p-value
            # (matching the PD.Alleles/PD.Effect/PD.Pval header names used
            # elsewhere in this file) -- verify against the live page layout.
            dataset = [cells[3], cells[9], cells[11]]
    return snprow + dataset
|
|
|
|
|
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':

    main()