Skip to content

Instantly share code, notes, and snippets.

@boraseoksoon
Forked from ronan-mch/StackOverflow.py
Created November 19, 2020 13:54
Show Gist options
  • Save boraseoksoon/e2e46bc4714bf9c2bfec44bcd27f3e06 to your computer and use it in GitHub Desktop.
Save boraseoksoon/e2e46bc4714bf9c2bfec44bcd27f3e06 to your computer and use it in GitHub Desktop.
This script scrapes a Stack Overflow user page and prints selected data from the HTML document as CSV.
#Stack Overflow scraper script
#
# Prompts for a user name, downloads a Stack Overflow user page and writes a
# header row followed by one data row to standard output as CSV.
# NOTE: this is Python 2 code (urllib2, raw_input, BeautifulSoup 3).
#imports necessary modules
import calendar
import csv
import sys
import time
from contextlib import closing
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup

# Numeric user id placed in the profile URL. It is still fixed, so the name
# typed at the prompt only fills the trailing URL segment.
# NOTE(review): presumably the site selects the profile by this id, which
# would make every run fetch the same user - confirm and parameterize.
USER_ID = '43089'
PROFILE_URL = 'http://stackoverflow.com/users/%s/%s'

DATE_FORMAT = "%Y-%m-%d"
SECONDS_PER_DAY = 86400.0

# Column names, in the order the data row is written.
COLUMNS = [
    'name', 'birth', 'lastSeen', 'lifetime', 'age', 'location',
    'reputation', 'questions', 'answers', 'totalVotes', 'upVotes',
    'downVotes', 'percentNice', 'percentMean',
]


def _detail(table, label):
    """Return the stripped text of the <td> that follows *label*.

    Returns '' when the label, the following cell, or the cell's single
    string is missing, so an absent row yields an empty field instead of
    an AttributeError.
    """
    marker = table.find(text=label)
    if marker is None:
        return ''
    cell = marker.findNext('td')
    if cell is None or cell.string is None:
        return ''
    return cell.string.strip()


def _to_int(text):
    """Parse an integer, ignoring comma thousands separators."""
    return int(text.replace(',', ''))


def _days_between(start, end):
    """Return the number of days from *start* to *end* as a float.

    Both arguments are 'YYYY-MM-DD' strings. calendar.timegm is used rather
    than time.mktime so local daylight-saving shifts cannot turn a whole
    number of days into a fraction.
    """
    begin = calendar.timegm(time.strptime(start, DATE_FORMAT))
    finish = calendar.timegm(time.strptime(end, DATE_FORMAT))
    return (finish - begin) / SECONDS_PER_DAY


def _csv_cell(value):
    """Convert *value* to the byte string Python 2's csv writer expects."""
    if isinstance(value, unicode):
        return value.encode('utf-8')
    return str(value)


username = raw_input("Username: ")
#defines url, retrieves it and turns it into a Beautiful Soup object
url = PROFILE_URL % (USER_ID, username)
# closing() releases the HTTP response as soon as the body has been read
with closing(urlopen(url)) as response:
    html = response.read()
soup = BeautifulSoup(html)
# fetches user details table
user_details = soup.find('table', {'class': 'user-details'})
# defines username
name = soup.h1.text
# span whose title attribute carries the join timestamp
temp_date = user_details.find(text='member for').findNext('td').find('span', {'class': 'cool'})
# keeps the first whitespace-separated token of the title (the date part)
birth = temp_date['title'].split()[0]
# last seen data - here the title is read from a span nested in the 'cool' span
temp_seen = user_details.find(text='seen').findNext('td').find('span', {'class': 'cool'})
seen = temp_seen.span['title'].split()[0]
# days between joining and last being seen
lifetime = _days_between(birth, seen)
# user location
location = _detail(user_details, 'location')
# age
age = _detail(user_details, 'age')
# reputation
reputation = soup.find('span', {'class': 'summarycount'}).text
# questions
questions = soup.find('span', {'class': 'summarycount ar'}).text
# answers
answers = soup.find('div', {'class': 'summarycount ar'}).text
# votes
votesCast = soup.find('table', {'class': 'votes-cast-stats'})
upVotes = _to_int(votesCast.td.text)
# positional lookup - presumably the down-vote cell; depends on exact markup
downVotes = _to_int(votesCast.contents[5].contents[1].text)
totalVotes = upVotes + downVotes
# Some custom metrics
if totalVotes:
    percentNice = (100 / float(totalVotes)) * upVotes
    percentMean = (100 / float(totalVotes)) * downVotes
else:
    # no votes cast: report 0% for both instead of dividing by zero
    percentNice = 0.0
    percentMean = 0.0

# csv.writer quotes fields that contain commas (e.g. a "City, Country"
# location) so each row always has exactly len(COLUMNS) fields
writer = csv.writer(sys.stdout, lineterminator='\n')
writer.writerow(COLUMNS)
writer.writerow([_csv_cell(value) for value in (
    name, birth, seen, lifetime, age, location, reputation, questions,
    answers, totalVotes, upVotes, downVotes, percentNice, percentMean,
)])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment