elidickinson · May 18, 2011 20:53
diff --git a/scrapbio.py b/scrapbio.py
 # First: wget --random-wait --wait=2 --limit-rate=100K http://e.mybio.zerista.com/exhibitor?exhibitor_page={1..142}
 # But that saves them with a stupid name, so rename using:
 # ls -d exhib* | sed 's/\(.*\)=\(.*\)$/mv "&" "bio\2.html"/' | sh
 #
 from BeautifulSoup import BeautifulSoup
 import csv
 import glob
 import os

 writer = csv.writer(open("bio.csv", "wb"))

 for infile in glob.glob('bio*.html'):
    soup = BeautifulSoup(open(infile))
    cells = soup.findAll('td','about')
    for cell in cells:
 	#print cell.name
 	exhibitor = cell.find('a').string.strip()
 	booth = cell.find('p','exhibitor_map_link')
 	tags = cell.find('span','tag_links')
 	if tags:
 	    taglinks = tags.findAll('a')
 	    tags = ", ".join([t.string.strip() for t in taglinks])
 	if booth:
 	    booth = booth.string.strip()
 	row = [exhibitor,booth,tags]
 	row = [unicode(x).encode('UTF-8','ignore') for x in row]
 	print row
 	writer.writerow(row)
	# First: wget --random-wait --wait=2 --limit-rate=100K http://e.mybio.zerista.com/exhibitor?exhibitor_page={1..142}
	# But that saves them with a stupid name, so rename using:
	# ls -d exhib* | sed 's/\(.*\)=\(.*\)$/mv "&" "bio\2.html"/' | sh
	#
	from BeautifulSoup import BeautifulSoup
	import csv
	import glob
	import os

	writer = csv.writer(open("bio.csv", "wb"))

	for infile in glob.glob('bio*.html'):
	soup = BeautifulSoup(open(infile))
	cells = soup.findAll('td','about')
	for cell in cells:
	#print cell.name
	exhibitor = cell.find('a').string.strip()
	booth = cell.find('p','exhibitor_map_link')
	tags = cell.find('span','tag_links')
	if tags:
	taglinks = tags.findAll('a')
	tags = ", ".join([t.string.strip() for t in taglinks])
	if booth:
	booth = booth.string.strip()
	row = [exhibitor,booth,tags]
	row = [unicode(x).encode('UTF-8','ignore') for x in row]
	print row
	writer.writerow(row)
No results found