Skip to content

Instantly share code, notes, and snippets.

@shazeline
Last active August 29, 2015 14:08
Show Gist options
  • Save shazeline/62ca864f32a701a190b6 to your computer and use it in GitHub Desktop.
Save shazeline/62ca864f32a701a190b6 to your computer and use it in GitHub Desktop.
Scrapes old scribe notes from Eggert's CS 111 course site.
import os
import re
import requests
import urllib
from collections import defaultdict
from bs4 import BeautifulSoup
def parse_table(table, base_url):
    """Extract lecture -> scribe-note-URL lists from a syllabus table.

    table    -- a BeautifulSoup <table> element from the syllabus page
    base_url -- course-site root, prepended to each relative note link

    Returns a dict mapping a normalized lecture name (lowercased,
    underscore-joined, with ';', ',' and '.' stripped) to a list of
    absolute note URLs.
    """
    data = defaultdict(list)
    for row in table.find_all('tr'):
        # Collect both header and data cells.
        row_data = row.find_all(re.compile('t[dh]'))
        for i, col in enumerate(row_data):
            if 'scribe' not in str(col):
                # Plain cell: collapse its text to one string.
                row_data[i] = ' '.join(col.stripped_strings)
            else:
                # Scribe-note cell: keep the raw hrefs instead of text.
                row_data[i] = [x.get('href') for x in col.find_all('a')]
        # Lecture rows have 5 cells and a title that starts with a digit
        # (e.g. "1. Introduction"); [:1] also tolerates an empty cell,
        # which the old row_data[3][0] indexing would crash on.
        if len(row_data) == 5 and row_data[3][:1].isdigit():
            lec = '_'.join(row_data[3].lower().split())
            for ch in ';,.':
                lec = lec.replace(ch, '')
            for link in row_data[4]:
                data[lec].append('%s/%s' % (base_url, link))
    return data
def get_links(base_url):
    # Fetch the course syllabus page and pull the scribe-note links
    # out of its first <table>.
    syllabus = requests.get('%s/%s' % (base_url, 'syllabus.html'))
    markup = BeautifulSoup(syllabus.text)
    schedule = markup.find_all('table')[0]
    return parse_table(schedule, base_url)
def process_file(link_id, base_url):
    """Rewrite relative src="..." references in a saved HTML file.

    link_id  -- path of the local HTML file to fix up in place
    base_url -- URL the file came from; a trailing 'index.html' (if
                any) is dropped so only the directory part is used
    """
    base_url = base_url.replace('index.html', '')
    # Text mode, not 'rb'/'wb': we do line-by-line string manipulation,
    # and binary mode breaks str ops on Python 3 and newlines on Windows.
    with open(link_id, 'r') as f:
        lines = [line.rstrip('\n') for line in f]
    new_file = []
    for line in lines:
        # Only touch lines whose src attributes are all relative.
        if 'src="' in line and 'src="http' not in line:
            # replace() fixes every occurrence on the line; the old
            # split-based code silently dropped everything after a
            # second src= on the same line.
            new_file.append(line.replace('src="', 'src="' + base_url))
        else:
            new_file.append(line)
    with open(link_id, 'w') as f:
        f.writelines(line + '\n' for line in new_file)
def get_scribe_dict():
    """Collect scribe-note links across all known course offerings.

    Tries every (quarter, year) combination of the CS 111 site from
    2011 through 2014 and merges their lecture -> link-list tables
    into one defaultdict(list).
    """
    all_links = defaultdict(list)
    for year in range(11, 15):
        for quarter in ['winter', 'spring', 'fall']:
            base_url = 'http://cs.ucla.edu/classes/%s%s/cs111' % (quarter, year)
            try:
                links = get_links(base_url)
            except Exception:
                # Best-effort crawl: many quarter/year combinations
                # simply don't exist.  A bare `except:` here would also
                # have swallowed KeyboardInterrupt/SystemExit.
                continue
            for lec in links:
                all_links[lec] += links[lec]
    return all_links
def download_notes(all_links):
for lec in all_links:
print '=' * len(lec)
print lec
print '=' * len(lec)
directory = 'notes/%s' % lec
if not os.path.exists(directory):
os.makedirs(directory)
for link in all_links[lec]:
try:
toks = link.split('/')
link_id = '%s_%s.html' % (toks[4], toks[7])
print link
loc = '%s/%s' % (directory, link_id)
urllib.URLopener().retrieve(link, loc)
process_file(loc, link)
except:
pass
# Guard the crawl so importing this module doesn't kick off downloads.
if __name__ == '__main__':
    download_notes(get_scribe_dict())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment