Skip to content

Instantly share code, notes, and snippets.

@shazeline
Last active August 29, 2015 14:08
Show Gist options
  • Save shazeline/62ca864f32a701a190b6 to your computer and use it in GitHub Desktop.
Save shazeline/62ca864f32a701a190b6 to your computer and use it in GitHub Desktop.
Scrapes old scribe notes from Eggert's CS 111 course site.
import os
import re
import requests
import urllib
from collections import defaultdict
from bs4 import BeautifulSoup
def parse_table(table, base_url):
    """Extract lecture -> scribe-note-URL lists from a syllabus table.

    table    -- a BeautifulSoup <table> element from the syllabus page
    base_url -- course-site root, prepended to each relative note link

    Returns a dict mapping a normalized lecture name (lowercased,
    underscore-joined, with ';', ',' and '.' stripped) to a list of
    absolute note URLs.
    """
    data = defaultdict(list)
    for row in table.find_all('tr'):
        # Collect both header and data cells.
        row_data = row.find_all(re.compile('t[dh]'))
        for i, col in enumerate(row_data):
            if 'scribe' not in str(col):
                # Plain cell: collapse its text to one string.
                row_data[i] = ' '.join(col.stripped_strings)
            else:
                # Scribe-note cell: keep the raw hrefs instead of text.
                row_data[i] = [x.get('href') for x in col.find_all('a')]
        # Lecture rows have 5 cells and a title that starts with a digit
        # (e.g. "1. Introduction"); [:1] also tolerates an empty cell,
        # which the old row_data[3][0] indexing would crash on.
        if len(row_data) == 5 and row_data[3][:1].isdigit():
            lec = '_'.join(row_data[3].lower().split())
            for ch in ';,.':
                lec = lec.replace(ch, '')
            for link in row_data[4]:
                data[lec].append('%s/%s' % (base_url, link))
    return data
def get_links(base_url):
    # Fetch the course syllabus page and pull the scribe-note links
    # out of its first <table>.
    syllabus = requests.get('%s/%s' % (base_url, 'syllabus.html'))
    markup = BeautifulSoup(syllabus.text)
    schedule = markup.find_all('table')[0]
    return parse_table(schedule, base_url)
def process_file(link_id, base_url):
    """Rewrite relative src="..." references in a saved HTML file.

    link_id  -- path of the local HTML file to fix up in place
    base_url -- URL the file came from; a trailing 'index.html' (if
                any) is dropped so only the directory part is used
    """
    base_url = base_url.replace('index.html', '')
    # Text mode, not 'rb'/'wb': we do line-by-line string manipulation,
    # and binary mode breaks str ops on Python 3 and newlines on Windows.
    with open(link_id, 'r') as f:
        lines = [line.rstrip('\n') for line in f]
    new_file = []
    for line in lines:
        # Only touch lines whose src attributes are all relative.
        if 'src="' in line and 'src="http' not in line:
            # replace() fixes every occurrence on the line; the old
            # split-based code silently dropped everything after a
            # second src= on the same line.
            new_file.append(line.replace('src="', 'src="' + base_url))
        else:
            new_file.append(line)
    with open(link_id, 'w') as f:
        f.writelines(line + '\n' for line in new_file)
def get_scribe_dict():
    """Collect scribe-note links across all known course offerings.

    Tries every (quarter, year) combination of the CS 111 site from
    2011 through 2014 and merges their lecture -> link-list tables
    into one defaultdict(list).
    """
    all_links = defaultdict(list)
    for year in range(11, 15):
        for quarter in ['winter', 'spring', 'fall']:
            base_url = 'http://cs.ucla.edu/classes/%s%s/cs111' % (quarter, year)
            try:
                links = get_links(base_url)
            except Exception:
                # Best-effort crawl: many quarter/year combinations
                # simply don't exist.  A bare `except:` here would also
                # have swallowed KeyboardInterrupt/SystemExit.
                continue
            for lec in links:
                all_links[lec] += links[lec]
    return all_links
def download_notes(all_links):
for lec in all_links:
print '=' * len(lec)
print lec
print '=' * len(lec)
directory = 'notes/%s' % lec
if not os.path.exists(directory):
os.makedirs(directory)
for link in all_links[lec]:
try:
toks = link.split('/')
link_id = '%s_%s.html' % (toks[4], toks[7])
print link
loc = '%s/%s' % (directory, link_id)
urllib.URLopener().retrieve(link, loc)
process_file(loc, link)
except:
pass
# Guard the crawl so importing this module doesn't kick off downloads.
if __name__ == '__main__':
    download_notes(get_scribe_dict())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment