-
-
Save boraseoksoon/e2e46bc4714bf9c2bfec44bcd27f3e06 to your computer and use it in GitHub Desktop.
This script scrapes a Stack Overflow user page and prints the relevant data from the HTML document as CSV.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stack Overflow scraper script
#
# Prompts for a username, downloads that user's Stack Overflow profile page
# and writes two CSV rows to standard output: a header row and one data row.
#
# NOTE: this is a Python 2 script (urllib2, raw_input, BeautifulSoup 3).

# imports necessary modules
import csv
import sys
import time
from urllib2 import urlopen

from BeautifulSoup import BeautifulSoup

SECONDS_PER_DAY = 86400
DATE_FORMAT = "%Y-%m-%d"


def _title_date(tag):
    """Return the first whitespace-separated token of tag's 'title' attribute.

    The script parses that token with DATE_FORMAT, i.e. it is expected to be
    the date part of a timestamp.
    """
    return tag['title'].split()[0]


def _percent(part, total):
    """Return part as a percentage of total, or 0.0 when total is zero."""
    if not total:
        # A user who has never voted would otherwise raise ZeroDivisionError.
        return 0.0
    return (100 / float(total)) * part


def _cell(value):
    """Return value in a form the Python 2 csv writer can emit.

    The Python 2 csv module cannot write non-ASCII unicode objects, so
    unicode text is encoded as UTF-8; everything else is passed through.
    """
    if isinstance(value, unicode):
        return value.encode('utf-8')
    return value


username = raw_input("Username: ")

# defines url, retrieves it and turns it into a Beautiful Soup object
# NOTE: the numeric user id in the path is hardcoded, so the username entered
# above only fills in the trailing part of the URL.
url = 'http://stackoverflow.com/users/43089/%s' % username
response = urlopen(url)
try:
    html = response.read()
finally:
    # Release the connection even if the read fails.
    response.close()
soup = BeautifulSoup(html)

# fetches user details table
user_details = soup.find('table', {'class': 'user-details'})

# defines username
name = soup.h1.text

# join date: the span holding the timestamp in the cell after "member for"
temp_date = user_details.find(text='member for').findNext('td').find('span', {'class': 'cool'})
# keep only the first token of the title (the date), dropping the rest
birth = _title_date(temp_date)

# last seen data - temp var used to deliver result
temp_seen = user_details.find(text='seen').findNext('td').find('span', {'class': 'cool'})
# The title is read from a span nested inside the matched one; if there is no
# nested span, fall back to the matched span itself (as done for the join
# date) instead of failing on None.
inner_seen = temp_seen.span
seen = _title_date(inner_seen if inner_seen is not None else temp_seen)

# structured as time data
tm_seen = time.strptime(seen, DATE_FORMAT)
tm_birth = time.strptime(birth, DATE_FORMAT)
t1 = time.mktime(tm_seen)
t2 = time.mktime(tm_birth)
# days between joining and last being seen
lifetime = (t1 - t2) / SECONDS_PER_DAY

# user location
location = user_details.find(text='location').findNext('td').string.strip()

# age
age = user_details.find(text='age').findNext('td').string.strip()

# reputation
reputation = soup.find('span', {'class': 'summarycount'}).text

# questions
questions = soup.find('span', {'class': 'summarycount ar'}).text

# answers
answers = soup.find('div', {'class': 'summarycount ar'}).text

# votes
votesCast = soup.find('table', {'class': 'votes-cast-stats'})
upVotes = int(votesCast.td.text)
# positional lookup into the votes table; depends on the exact page markup
downVotes = int(votesCast.contents[5].contents[1].text)
totalVotes = upVotes + downVotes

# Some custom metrics
percentNice = _percent(upVotes, totalVotes)
percentMean = _percent(downVotes, totalVotes)

# Emit real CSV: the csv writer quotes fields that contain commas (e.g. a
# location such as "London, UK") and adds no padding around separators.
writer = csv.writer(sys.stdout, lineterminator='\n')
writer.writerow([
    'name', 'birth', 'lastSeen', 'lifetime', 'age', 'location',
    'reputation', 'questions', 'answers', 'totalVotes', 'upVotes',
    'downVotes', 'percentNice', 'percentMean',
])
writer.writerow([_cell(value) for value in (
    name, birth, seen, lifetime, age, location, reputation, questions,
    answers, totalVotes, upVotes, downVotes, percentNice, percentMean,
)])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment