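# Crawler for coursetalk.org course and review data. Scrapes the course
# listing, each course's page and its reviews, and writes pipe-delimited
# .dat files. Python 2 script; requires BeautifulSoup 3 and requests.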
import re
import urllib2
from unicodedata import normalize

import requests
from BeautifulSoup import BeautifulSoup
URL_BASE = 'http://coursetalk.org/'

# Output file handles, opened by create_files().
cursos = None
ratings = None
users = None
tags = None
professors = None
client = None

# Maps the CSS class of a review's star widget to a numeric rating.
stars2ratings = {'stars s10': '5.0', 'stars s9': '4.5', 'stars s8': '4.0',
                 'stars s7': '3.5', 'stars s6': '3.0', 'stars s5': '2.5',
                 'stars s4': '2.0', 'stars s3': '1.5', 'stars s2': '1.0',
                 'stars s1': '0.5', 'stars s0': '0.0'}
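# Log in with a persistent requests session. The login form looks
# Django-style: the 'csrftoken' cookie has to be fetched first and echoed
# back in the 'csrfmiddlewaretoken' form field.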
def session_login():
    global client
    URL = 'http://coursetalk.org/login'
    client = requests.session()
    # Retrieve the CSRF token first
    client.get(URL)  # sets the csrftoken cookie
    csrftoken = client.cookies['csrftoken']
    login_data = dict(email='[email protected]', password='marcelpc',
                      csrfmiddlewaretoken=csrftoken, next='/')
    client.post(URL, data=login_data, headers=dict(Referer=URL))
    return client
def remover_acentos(txt):
    """Strip accents and force ASCII ('remover acentos' = 'remove accents')."""
    if txt:
        return normalize('NFKD', txt).encode('ASCII', 'ignore')
    else:
        return ''
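# e.g. remover_acentos(u'Cálculo é fácil') -> 'Calculo e facil'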
def create_files():
    """Open the pipe-delimited output files for writing."""
    global cursos, ratings, users, tags, professors
    cursos = open('cursos.dat', 'w')
    ratings = open('ratings.dat', 'w')
    users = open('users.dat', 'w')
    tags = open('course-tags.dat', 'w')
    professors = open('course-professors.dat', 'w')
def crawl_description(url):
    """Fetch the description page (served in an iframe) and return its
    first paragraph."""
    html_data = urllib2.urlopen(url).read()
    html_parsed = BeautifulSoup(html_data)
    description = html_parsed.find('p')
    return description.contents[0]
def crawl_inside_course(url):
    """Scrape a single course page: professors, rating, difficulty,
    workload, topic tags and description."""
    html_data = urllib2.urlopen(url).read()
    html_parsed = BeautifulSoup(html_data)
    data = {}
    box = html_parsed.find('div', {'class': 'course_box'})
    professores = []
    for professor in box.find('h5').contents[0].split(','):
        professores.append(professor.replace('—', '').replace(' ', '').strip())
    data['professors'] = professores
    try:
        data['rating'] = box.find('div', {'class': 'score'}).contents[0]
    except AttributeError:
        data['rating'] = ''
    details = box.find('table', {'class': 'course_details'})
    data['difficulty'] = details.findAll('tr')[1].findAll('td')[1].contents[0]
    workload = details.findAll('tr')[2].findAll('td')[1].contents[0]
    data['workload'] = workload
    try:
        # The workload cell is sometimes wrapped in an extra tag; unwrap it.
        workload = re.search(r'<(.*?)>(.*?)</(.*?)>', str(workload)).group(2)
        data['workload'] = unicode(workload)
    except AttributeError:
        pass
    topics = []
    for topic in details.findAll('a', {'class': 'tag'}):
        topics.append(topic.contents[0])
    data['topics'] = topics
    # The description lives on a separate page loaded through an iframe.
    data['description'] = crawl_description(URL_BASE + html_parsed.find('iframe')['src'])
    return data
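# Parse one listing page: the 'course_list' table has one course per row.
# Yields a dict per course, merged with the details from its own page.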
def crawl_course(url):
    print url
    html_data = urllib2.urlopen(url).read()
    html_parsed = BeautifulSoup(html_data)
    table = html_parsed.find('table', {'class': 'table course_list'})
    for course in table.findAll('tr')[1:]:  # skip the header row
        tds = course.findAll('td')
        c = {}
        if tds[0].find('a'):
            c['provider'] = tds[0].find('a')['href'].replace('/', '')
        else:
            c['provider'] = None
        c['slug'] = tds[1].find('a')['href']
        c['name'] = tds[1].find('a').contents[0]
        if len(tds[1].findAll('a')) > 1:
            c['university'] = tds[1].findAll('a')[1].contents[0]
        else:
            c['university'] = None
        # Merge in professors, rating, difficulty, workload, topics, description.
        c.update(crawl_inside_course(URL_BASE + c['slug']))
        yield c
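# Collect (author, rating) pairs from a course's paginated review pages.
# The numeric rating is read off the star widget's CSS class (stars2ratings).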
def crawl_reviews(url, client):
    total = 0
    pg = 1
    raw_html = client.get(url + '/?page=1').text
    html_parsed = BeautifulSoup(raw_html)
    try:
        pages = html_parsed.find('div', {'class': 'pagination pagination-centered'}).findAll('a')[1].contents[0]
        pages = int(re.search(r'of ([\d]+)', pages).group(1))
    except AttributeError:
        if 'No reviews yet' in raw_html:
            pages = 0
        else:
            print 'only 1 page'
            pages = 1
    autores = []
    ratings = []
    while pg <= pages:
        for review in html_parsed.findAll('tr', {'class': 'review-tr'}):
            try:
                author = review.find('a')['href'].replace('/u/', '')
            except (KeyError, TypeError):  # no profile link on the review
                author = 'anonymous'
            stars = review.find('div', {'class': re.compile('stars')})
            autores.append(author)
            ratings.append(stars2ratings[stars['class']])
            total += 1
        pg += 1
        if pg <= pages:  # avoid fetching one page past the end
            html_parsed = BeautifulSoup(client.get(url + '/?page=%d' % pg).text)
    print 'total reviews crawled:', total
    return autores, ratings
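# Main crawl loop. Output record formats (pipe-delimited):
#   cursos.dat:            id|name|rating|workload|university|difficulty|provider
#   course-tags.dat:       course_id|tag
#   course-professors.dat: course_id|professor
#   users.dat:             user_id|username
#   ratings.dat:           user_id|course_id|rating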
def crawl_courses(client):
    total = 0
    id_curso = 1
    id_user = 1
    users_set = {}
    for pg in range(1, 113):  # 112 listing pages at crawl time
        url = URL_BASE + '?page=%d' % pg
        for course in crawl_course(url):
            print course
            cursos.write('%d|%s|%s|%s|%s|%s|%s\n' % (id_curso,
                         remover_acentos(course['name']), course['rating'],
                         remover_acentos(course['workload']),
                         remover_acentos(course['university']),
                         course['difficulty'], course['provider']))
            for tag in course['topics']:
                tags.write('%d|%s\n' % (id_curso, tag))
            for professor in course['professors']:
                professors.write('%d|%s\n' % (id_curso, remover_acentos(professor)))
            autores, rt = crawl_reviews(URL_BASE + course['slug'], client)
            for autor, nota in zip(autores, rt):
                if autor not in users_set:
                    users_set[autor] = id_user
                    id_user += 1
                    users.write('%d|%s\n' % (users_set[autor], autor))
                ratings.write('%d|%d|%s\n' % (users_set[autor], id_curso, nota))
                if autor == 'anonymous':
                    # each anonymous review gets a fresh user id
                    del users_set[autor]
            id_curso += 1
        total += 1
    print 'total pages imported:', total
    print 'total courses imported:', id_curso - 1
    cursos.close()
    users.close()
    ratings.close()
    professors.close()
    tags.close()
if __name__ == '__main__':
    create_files()
    client = session_login()
    crawl_courses(client)
    #crawl_inside_course(URL_BASE + '/coursera/pre-calculus')
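# Running the script directly logs in and writes the five .dat files into
# the current working directory.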