Scrapes old scribe notes off of Eggert's CS 111 site.
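# Scrape old scribe notes from the quarterly UCLA CS 111 course pages.
# A lightly modernized sketch: it assumes Python 3 and the third-party
# packages `requests` and `beautifulsoup4`
# (pip install requests beautifulsoup4).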
import os
import re
import urllib.request
from collections import defaultdict

import requests
from bs4 import BeautifulSoup

def parse_table(table, base_url):
    """Map each lecture's slugified title to its scribe-note links."""
    data = defaultdict(list)
    for row in table.find_all('tr'):
        row_data = row.find_all(re.compile('t[dh]'))
        for i, col in enumerate(row_data):
            if 'scribe' not in str(col):
                # Ordinary cell: collapse its text into one string.
                row_data[i] = ' '.join(col.stripped_strings)
            else:
                # Scribe-notes cell: keep the list of link targets instead.
                row_data[i] = [x.get('href') for x in col.find_all('a')]
        # Lecture rows have five cells and a title that starts with a digit.
        if len(row_data) == 5 and row_data[3][0].isdigit():
            # Slugify the title into a safe directory name.
            lec = '_'.join(row_data[3].lower().split())
            for ch in ';,.':
                lec = lec.replace(ch, '')
            for link in row_data[4]:
                data[lec].append('%s/%s' % (base_url, link))
    return data

def get_links(base_url):
    """Fetch a quarter's syllabus page and parse its first table."""
    page = requests.get('%s/syllabus.html' % base_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    table = soup.find_all('table')[0]
    return parse_table(table, base_url)

def process_file(path, base_url):
    """Rewrite relative src attributes in a downloaded page to absolute URLs."""
    # Drop a trailing index.html so relative paths resolve against the
    # note's directory.
    base_url = base_url.replace('index.html', '')
    with open(path) as f:
        lines = [line.rstrip('\n') for line in f]
    new_file = []
    for line in lines:
        if 'src="' in line and 'src="http' not in line:
            # Relative resource: prefix it with the page's original base URL.
            toks = line.split('src="')
            new_file.append(toks[0] + 'src="' + base_url + toks[1])
        else:
            new_file.append(line)
    with open(path, 'w') as f:
        for x in new_file:
            f.write(x + '\n')

def get_scribe_dict():
    """Collect scribe-note links for every quarter of CS 111 from 2011-2014."""
    years = range(11, 15)
    quarters = ['winter', 'spring', 'fall']
    all_links = defaultdict(list)
    for year in years:
        for quarter in quarters:
            base_url = 'http://cs.ucla.edu/classes/%s%s/cs111' % (quarter, year)
            try:
                for lec, links in get_links(base_url).items():
                    all_links[lec] += links
            except Exception:
                # Some quarters have no syllabus in this format; skip them.
                pass
    return all_links

def download_notes(all_links):
    """Download each lecture's notes into notes/<lecture>/ and fix their links."""
    for lec in all_links:
        print('=' * len(lec))
        print(lec)
        print('=' * len(lec))
        directory = 'notes/%s' % lec
        if not os.path.exists(directory):
            os.makedirs(directory)
        for link in all_links[lec]:
            try:
                # Name the file after the quarter (path segment 4) and a
                # segment of the note's own path (segment 7).
                toks = link.split('/')
                link_id = '%s_%s.html' % (toks[4], toks[7])
                print(link)
                loc = '%s/%s' % (directory, link_id)
                urllib.request.urlretrieve(link, loc)
                process_file(loc, link)
            except Exception:
                # Skip links that don't match the expected URL layout.
                pass


if __name__ == '__main__':
    download_notes(get_scribe_dict())
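To try it, save the script under a name of your choice (say, scrape_notes.py, a hypothetical filename), install the dependencies noted at the top, and run `python scrape_notes.py`. Pages land under notes/<lecture_slug>/ with their relative src attributes rewritten to absolute URLs.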