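# Crawler for coursetalk.org course and review data. Scrapes the course
# listing, each course's page and its reviews, and writes pipe-delimited
# .dat files. Python 2 script; requires BeautifulSoup 3 and requests.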
import re
import urllib2
from unicodedata import normalize

import requests
from BeautifulSoup import BeautifulSoup
URL_BASE = 'http://coursetalk.org/'

# Output file handles, opened by create_files().
cursos = None
ratings = None
users = None
tags = None
professors = None
client = None

# Maps the CSS class of a review's star widget to a numeric rating.
stars2ratings = {'stars s10': '5.0', 'stars s9': '4.5', 'stars s8': '4.0',
                 'stars s7': '3.5', 'stars s6': '3.0', 'stars s5': '2.5',
                 'stars s4': '2.0', 'stars s3': '1.5', 'stars s2': '1.0',
                 'stars s1': '0.5', 'stars s0': '0.0'}
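# Log in with a persistent requests session. The login form looks
# Django-style: the 'csrftoken' cookie has to be fetched first and echoed
# back in the 'csrfmiddlewaretoken' form field.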
def session_login():
    global client
    URL = 'http://coursetalk.org/login'
    client = requests.session()
    # Retrieve the CSRF token first
    client.get(URL)  # sets the csrftoken cookie
    csrftoken = client.cookies['csrftoken']
    login_data = dict(email='[email protected]', password='marcelpc',
                      csrfmiddlewaretoken=csrftoken, next='/')
    client.post(URL, data=login_data, headers=dict(Referer=URL))
    return client
def remover_acentos(txt):
    """Strip accents and force ASCII ('remover acentos' = 'remove accents')."""
    if txt:
        return normalize('NFKD', txt).encode('ASCII', 'ignore')
    else:
        return ''
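# e.g. remover_acentos(u'Cálculo é fácil') -> 'Calculo e facil'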
def create_files():
    """Open the pipe-delimited output files for writing."""
    global cursos, ratings, users, tags, professors
    cursos = open('cursos.dat', 'w')
    ratings = open('ratings.dat', 'w')
    users = open('users.dat', 'w')
    tags = open('course-tags.dat', 'w')
    professors = open('course-professors.dat', 'w')
def crawl_description(url):
    """Fetch the description page (served in an iframe) and return its
    first paragraph."""
    html_data = urllib2.urlopen(url).read()
    html_parsed = BeautifulSoup(html_data)
    description = html_parsed.find('p')
    return description.contents[0]
def crawl_inside_course(url):
    """Scrape a single course page: professors, rating, difficulty,
    workload, topic tags and description."""
    html_data = urllib2.urlopen(url).read()
    html_parsed = BeautifulSoup(html_data)
    data = {}
    box = html_parsed.find('div', {'class': 'course_box'})
    professores = []
    for professor in box.find('h5').contents[0].split(','):
        professores.append(professor.replace('—', '').replace(' ', '').strip())
    data['professors'] = professores
    try:
        data['rating'] = box.find('div', {'class': 'score'}).contents[0]
    except AttributeError:
        data['rating'] = ''
    details = box.find('table', {'class': 'course_details'})
    data['difficulty'] = details.findAll('tr')[1].findAll('td')[1].contents[0]
    workload = details.findAll('tr')[2].findAll('td')[1].contents[0]
    data['workload'] = workload
    try:
        # The workload cell is sometimes wrapped in an extra tag; unwrap it.
        workload = re.search(r'<(.*?)>(.*?)</(.*?)>', str(workload)).group(2)
        data['workload'] = unicode(workload)
    except AttributeError:
        pass
    topics = []
    for topic in details.findAll('a', {'class': 'tag'}):
        topics.append(topic.contents[0])
    data['topics'] = topics
    # The description lives on a separate page loaded through an iframe.
    data['description'] = crawl_description(URL_BASE + html_parsed.find('iframe')['src'])
    return data
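# Parse one listing page: the 'course_list' table has one course per row.
# Yields a dict per course, merged with the details from its own page.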
def crawl_course(url):
    print url
    html_data = urllib2.urlopen(url).read()
    html_parsed = BeautifulSoup(html_data)
    table = html_parsed.find('table', {'class': 'table course_list'})
    for course in table.findAll('tr')[1:]:  # skip the header row
        tds = course.findAll('td')
        c = {}
        if tds[0].find('a'):
            c['provider'] = tds[0].find('a')['href'].replace('/', '')
        else:
            c['provider'] = None
        c['slug'] = tds[1].find('a')['href']
        c['name'] = tds[1].find('a').contents[0]
        if len(tds[1].findAll('a')) > 1:
            c['university'] = tds[1].findAll('a')[1].contents[0]
        else:
            c['university'] = None
        # Merge in professors, rating, difficulty, workload, topics, description.
        c.update(crawl_inside_course(URL_BASE + c['slug']))
        yield c
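# Collect (author, rating) pairs from a course's paginated review pages.
# The numeric rating is read off the star widget's CSS class (stars2ratings).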
def crawl_reviews(url, client):
    total = 0
    pg = 1
    raw_html = client.get(url + '/?page=1').text
    html_parsed = BeautifulSoup(raw_html)
    try:
        pages = html_parsed.find('div', {'class': 'pagination pagination-centered'}).findAll('a')[1].contents[0]
        pages = int(re.search(r'of ([\d]+)', pages).group(1))
    except AttributeError:
        if 'No reviews yet' in raw_html:
            pages = 0
        else:
            print 'only 1 page'
            pages = 1
    autores = []
    ratings = []
    while pg <= pages:
        for review in html_parsed.findAll('tr', {'class': 'review-tr'}):
            try:
                author = review.find('a')['href'].replace('/u/', '')
            except (KeyError, TypeError):  # no profile link on the review
                author = 'anonymous'
            stars = review.find('div', {'class': re.compile('stars')})
            autores.append(author)
            ratings.append(stars2ratings[stars['class']])
            total += 1
        pg += 1
        if pg <= pages:  # avoid fetching one page past the end
            html_parsed = BeautifulSoup(client.get(url + '/?page=%d' % pg).text)
    print 'total reviews crawled:', total
    return autores, ratings
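# Main crawl loop. Output record formats (pipe-delimited):
#   cursos.dat:            id|name|rating|workload|university|difficulty|provider
#   course-tags.dat:       course_id|tag
#   course-professors.dat: course_id|professor
#   users.dat:             user_id|username
#   ratings.dat:           user_id|course_id|rating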
def crawl_courses(client):
    total = 0
    id_curso = 1
    id_user = 1
    users_set = {}
    for pg in range(1, 113):  # 112 listing pages at crawl time
        url = URL_BASE + '?page=%d' % pg
        for course in crawl_course(url):
            print course
            cursos.write('%d|%s|%s|%s|%s|%s|%s\n' % (id_curso,
                         remover_acentos(course['name']), course['rating'],
                         remover_acentos(course['workload']),
                         remover_acentos(course['university']),
                         course['difficulty'], course['provider']))
            for tag in course['topics']:
                tags.write('%d|%s\n' % (id_curso, tag))
            for professor in course['professors']:
                professors.write('%d|%s\n' % (id_curso, remover_acentos(professor)))
            autores, rt = crawl_reviews(URL_BASE + course['slug'], client)
            for autor, nota in zip(autores, rt):
                if autor not in users_set:
                    users_set[autor] = id_user
                    id_user += 1
                    users.write('%d|%s\n' % (users_set[autor], autor))
                ratings.write('%d|%d|%s\n' % (users_set[autor], id_curso, nota))
                if autor == 'anonymous':
                    # each anonymous review gets a fresh user id
                    del users_set[autor]
            id_curso += 1
        total += 1
    print 'total pages imported:', total
    print 'total courses imported:', id_curso - 1
    cursos.close()
    users.close()
    ratings.close()
    professors.close()
    tags.close()
if __name__ == '__main__':
    create_files()
    client = session_login()
    crawl_courses(client)
    #crawl_inside_course(URL_BASE + '/coursera/pre-calculus')
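# Running the script directly logs in and writes the five .dat files into
# the current working directory.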