gartenfeld · February 23, 2013 05:50
diff --git a/name-scraper.py b/name-scraper.py
 from bs4 import BeautifulSoup
 import re # Regular Expressions
 import collections # Data Types
 import sys # File operations
 import codecs # UniCode support

 def scrape(page):
 	"""Collect student names from a saved HTML page.

 	Args:
 		page: Path to a UTF-8 encoded HTML file.

 	Returns:
 		A list with the text of the first link in every matching table
 		cell, skipping cells without a link and names containing a digit.
 	"""
 	# Dump raw HTML into Soup; the with-block closes the file afterwards
 	with codecs.open(page, 'r', encoding='utf-8') as handle:
 		soup = BeautifulSoup(handle.read())

 	students = []

 	# Walk the cells containing the name
 	for cell in soup.find_all('td', 'cell c1'): # Specifics vary
 		link = cell.find('a')
 		if link is None: # No link in this cell, so there is no name to read
 			continue
 		# Read the text straight from the parsed tag; re-parsing str(link) is unneeded
 		student = link.get_text()
 		# search() finds a digit anywhere; match() only looked at the first character
 		if not re.search(r'\d', student): # Exclude zombie members with numbers in their names
 			students.append(student)

 	return students

 if __name__ == '__main__':
 	# Take the page from the command line when given, else use the built-in path
 	page = sys.argv[1] if len(sys.argv) > 1 else '/file_dir/file_name.html'
 	# Parenthesized single-argument print is valid as a statement and as a function call
 	print(scrape(page))
	from bs4 import BeautifulSoup
	import re # Regular Expressions
	import collections # Data Types
	import sys # File operations
	import codecs # UniCode support

	def scrape(page):
		"""Collect student names from a saved HTML page.

		Args:
			page: Path to a UTF-8 encoded HTML file.

		Returns:
			A list with the text of the first link in every matching table
			cell, skipping cells without a link and names containing a digit.
		"""
		# Dump raw HTML into Soup; the with-block closes the file afterwards
		with codecs.open(page, 'r', encoding='utf-8') as handle:
			soup = BeautifulSoup(handle.read())

		students = []

		# Walk the cells containing the name
		for cell in soup.find_all('td', 'cell c1'): # Specifics vary
			link = cell.find('a')
			if link is None: # No link in this cell, so there is no name to read
				continue
			# Read the text straight from the parsed tag; re-parsing str(link) is unneeded
			student = link.get_text()
			# search() finds a digit anywhere; match() only looked at the first character
			if not re.search(r'\d', student): # Exclude zombie members with numbers in their names
				students.append(student)

		return students

	if __name__ == '__main__':
		# Take the page from the command line when given, else use the built-in path
		page = sys.argv[1] if len(sys.argv) > 1 else '/file_dir/file_name.html'
		# Parenthesized single-argument print is valid as a statement and as a function call
		print(scrape(page))
No results found