Skip to content

Instantly share code, notes, and snippets.

@explodecomputer
Last active August 29, 2015 14:18
Show Gist options
  • Save explodecomputer/9a8d3fc9583d300b4244 to your computer and use it in GitHub Desktop.
Save explodecomputer/9a8d3fc9583d300b4244 to your computer and use it in GitHub Desktop.
Scrape PD gene website for GWAS associations for a given list of SNPs

Scrape PD gene website for GWAS associations for a given list of SNPs

Requirements

This requires Python v2.7 plus the following plugin: BeautifulSoup 4 (bs4).

Usage

It reads in a file that has one column specifying SNP names and any other columns you want (they won't be used by the script but will be printed in the output along with the results). You specify which column is the SNP column, and optionally also specify the delimiter used to separate fields and whether or not there is a header row. The default output file is <input-filename>.pd. For example, using the test.csv file you would run like this:

python2.7 scrape_pdgene.py --snp-list test.csv --column 3 --header --delim ',' --out results.csv

The --snp-list flag specifies the input file, the --column flag specifies which column is the SNP IDs, the --header flag specifies that the first line is a header (omit this flag for the default of no header line), the --delim flag specifies the character that separates the fields in the file, and the --out flag specifies where the results will be saved.

For more info run:

python2.7 scrape_pdgene.py --help

More information on the PD Gene database can be found at http://www.pdgene.org.

#!/usr/bin/python
import csv
import re
import urllib2
from bs4 import BeautifulSoup
import argparse
def main():
args = argument_parser()
# Read in the file
dat = get_snplist(args.snpfile[0], args.header, args.delim)
header = dat[1]
snplist = dat[0]
# Extract the info from pd gene
print ""
print "Scraping www.pdgene.org for association results for "+str(len(snplist))+" SNPs from "+args.snpfile[0]
print "Will write results to "+args.outfile
print ""
snplist = [extract_tabledata(snprow, args.column) for snprow in snplist]
if args.header:
dat = [header] + snplist
else:
dat = snplist
# Write out to file
write_snplist(dat, args.outfile)
def argument_parser():
    """Define, parse and normalise the command line options.

    Returns the argparse namespace with outfile/column/delim already
    post-processed into their usable forms."""
    ap = argparse.ArgumentParser(description="Scrape the PD gene online database.")
    ap.add_argument("--snp-list", dest='snpfile', nargs=1, help="File with at least one column with SNP rs IDs", required=True)
    ap.add_argument("--column", dest='column', type=int, nargs=1, help="Which column is the SNP name", required=True)
    ap.add_argument("--header", dest="header", action='store_const', const=True, default=False, help="Use this flag if there is a header in the snp-list file")
    ap.add_argument("--delim", dest="delim", nargs=1, default=',', help="Character used to separate fields in snp-list-file. Defaults to ',' but will take any single character. If you want to use tab separator then specify using '\\t'")
    ap.add_argument("--out", dest='outfile', nargs=1, help="Output filename. Defaults to <snp-list-file>.pd")

    opts = ap.parse_args()

    # Default the output name to <input>.pd when --out was not given
    if opts.outfile:
        opts.outfile = opts.outfile[0]
    else:
        opts.outfile = opts.snpfile[0] + ".pd"

    # Convert the 1-based user column to a 0-based index
    opts.column = opts.column[0] - 1

    # Interpret escapes such as '\\t' typed on the command line (Python 2 only)
    opts.delim = opts.delim[0].decode('string-escape')

    return opts
def get_snplist(filename, header, delim):
    """Read the SNP input file.

    filename -- path to the delimited input file
    header   -- True if the first line is a header row
    delim    -- single-character field delimiter

    Returns [data, headers] where data is a list of rows (each a list of
    strings) and headers is the header row extended with the three PD Gene
    result columns, or [] when the file has no header.
    """
    # FIX: open with 'r' instead of 'rU' -- the 'U' mode flag was deprecated
    # and removed in Python 3.11; universal-newline handling is the default.
    with open(filename, 'r') as csvfile:
        gwas = csv.reader(csvfile, delimiter=delim, quotechar='"')
        if header:
            # FIX: guard against a completely empty file, where next() would
            # return the None default and `None + [...]` would raise TypeError.
            first = next(gwas, None)
            headers = (first if first is not None else []) + ["PD.Alleles", "PD.Effect", "PD.Pval"]
        else:
            headers = []
        data = [row for row in gwas]
    return [data, headers]
def write_snplist(dat, filename):
print "\nWriting results to "+filename
with open(filename, "wb") as f:
writer = csv.writer(f)
writer.writerows(dat)
def bs_preprocess(html):
    """Remove distracting whitespace and newline characters from raw HTML
    so the parsed tree has clean text nodes."""
    edge_ws = re.compile(r'(^\s+)|(\s+$)', re.MULTILINE)
    text = edge_ws.sub('', html)        # trim each line's leading/trailing whitespace
    text = text.replace('\n', ' ')      # collapse newlines into spaces
    text = re.sub(r'\s+<', '<', text)   # drop whitespace before opening tags
    return re.sub(r'>\s+', '>', text)   # drop whitespace after closing tags
def get_headings(snp):
    """Fetch the pdgene.org page for `snp` and return the column headings
    of its results table (the <th> texts of the first row)."""
    url = 'http://www.pdgene.org/view?poly=' + snp
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(bs_preprocess(html))
    table = soup.find("table", attrs={"class": "list"})
    first_row = table.find("tr")
    return [th.get_text() for th in first_row.find_all("th")]
def extract_tabledata(snprow, column):
print "Processing SNP "+snprow[column]
url = 'http://www.pdgene.org/view?poly='+snprow[column]
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(bs_preprocess(page))
table = soup.find("table", attrs={"class":"list"})
if table != None:
for row in table.find_all("tr")[1:]:
dataset = [td.get_text() for td in row.find_all("td")]
dataset = [dataset[3], dataset[9], dataset[11]]
else:
dataset = ["NA", "NA", "NA"]
snprow = snprow+dataset
return snprow
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Disease/Trait Strongest SNP-Risk Allele SNPs OR or beta Reported Gene(s)
Cholesterol: total rs1883025-T rs1883025 .07 ABCA1
Cholesterol: total rs2287623-G rs2287623 .03 ABCB11
Cholesterol: total rs4299376-G rs4299376 .08 ABCG5: ABCG8
Cholesterol: total rs9411489-T rs9411489 .07 ABO
Cholesterol: total rs2131925-G rs2131925 .08 ANGPTL3
Cholesterol: total rs964184-C rs964184 .12 APOA1
Cholesterol: total rs1367117-A rs1367117 .10 APOB
Cholesterol: total rs4420638-G rs4420638 .20 APOE
Cholesterol: total rs1077514-C rs1077514 .03 ASAP3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment