Scrapes student and faculty contact information from the Harvard Student Organizations site (http://osl.fas.harvard.edu/student-organizations)
#!/usr/bin/env python
''' Script to scrape contact information from the Harvard Student Organizations site
(http://osl.fas.harvard.edu/student-organizations).

Run with 'python /path/to/scraper'.

-- License --

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
from bs4 import BeautifulSoup
import requests
import csv

BASE_URL = "http://usodb.fas.harvard.edu/public/index.cgi"
OUTPUT = "clubs.csv"
def soupify(url):
    """ Download a URL and parse its text into a BeautifulSoup object. """
    req = requests.get(url)
    # Name the parser explicitly; newer bs4 versions warn if it is omitted.
    return BeautifulSoup(req.text, 'html.parser')
def get_contact_info(url):
    """ Get all email addresses from a club page. """
    soup = soupify(url)
    contacts = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # Skip anchors with no href, and keep only non-empty mailto: links.
        if href and href.startswith('mailto:') and len(href) > 7:
            # Strip the leading 'mailto:' to leave the bare address.
            contacts.append(href[7:])
    return contacts
def get_clubs():
    """ Get all clubs from the main Clubs & Organizations page. """
    clubs = {}
    soup = soupify(BASE_URL)
    for a in soup.find_all('a'):
        href = a.get('href')
        # Skip anchors that lack an href or a text label.
        if not href or not a.string:
            continue
        link = BASE_URL + href
        clubs[a.string.strip()] = {
            'url': link,
            'contacts': get_contact_info(link),
        }
    return clubs
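# For reference, get_clubs() returns a mapping shaped like the sketch
# below (the club name, query string, and address are hypothetical,
# not real scraped data):
#
#   {'Example Club': {
#       'url': 'http://usodb.fas.harvard.edu/public/index.cgi?...',
#       'contacts': ['officer@example.harvard.edu']}}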
def write(data, file):
    """ Write data to CSV. """
    # Python 3: open in text mode; newline='' lets the csv module
    # handle line endings itself.
    with open(file, "w", newline='') as f:
        writer = csv.writer(f, delimiter=',')
        for line in data:
            writer.writerow(line)
def format(clubs):
    """ Flatten the clubs dict into CSV rows, one club per row. """
    data = [['Club', 'URL', 'Contact Information']]
    for club in clubs:
        row = [club, clubs[club]['url']]
        row.extend(clubs[club]['contacts'])
        data.append(row)
    return data
def main():
    clubs = get_clubs()
    data = format(clubs)
    write(data, OUTPUT)

if __name__ == '__main__':
    main()
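A minimal sketch of checking the script's output, assuming it has already been run and has written clubs.csv (the OUTPUT filename above) to the working directory; each row after the header holds the club name, its page URL, and any scraped addresses:

import csv

# Read the generated CSV back and print a quick summary.
with open('clubs.csv', newline='') as f:
    rows = list(csv.reader(f))

header, clubs = rows[0], rows[1:]
print('%d clubs scraped' % len(clubs))
for name, url, *contacts in clubs[:5]:
    print(name, url, contacts)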