Created
February 23, 2013 05:50
-
-
Save gartenfeld/5018624 to your computer and use it in GitHub Desktop.
Scraping names of participants in a class from a raw HTML dump.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import re # Regular Expressions | |
import collections # Data Types | |
import sys # File operations | |
import codecs # UniCode support | |
def scrape(page):
    """Return the participant names found in a saved HTML page.

    The page is read as UTF-8 and parsed with BeautifulSoup. Every
    ``<td>`` whose class attribute is ``cell c1`` is inspected, and the
    text of the first ``<a>`` element inside it is taken as a name.

    Args:
        page: Path to the HTML file on disk.

    Returns:
        A list of name strings in document order. Cells without a link
        and names containing any digit are left out.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the file even if reading or decoding fails.
    with codecs.open(page, 'r', encoding='utf-8') as handle:
        raw_data = handle.read()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(raw_data, 'html.parser')

    students = []
    for cell in soup.find_all('td', 'cell c1'):  # Specifics vary
        link = cell.find('a')
        if link is None:
            # No link in this cell; without this guard the literal text
            # "None" would be collected as a participant name.
            continue
        # Read the text straight from the tag: re-serialising and re-parsing
        # it adds nothing and breaks on non-ASCII names under Python 2.
        student = link.get_text()
        # Exclude zombie members with numbers in their names. search() scans
        # the whole name; match() would only test its first character.
        if not re.search(r'\d', student):
            students.append(student)
    return students
if __name__ == '__main__':
    # NOTE(review): this looks like a placeholder path -- point it at the
    # saved HTML dump before running the script.
    page = '/file_dir/file_name.html'
    # The parenthesised single-argument form prints the same output under
    # Python 2 and is also valid Python 3, unlike the bare print statement.
    print(scrape(page))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment