Skip to content

Instantly share code, notes, and snippets.

@jochasinga
Created July 10, 2015 14:27
Show Gist options
  • Save jochasinga/2e9ca06706ee0482e9fc to your computer and use it in GitHub Desktop.
Save jochasinga/2e9ca06706ee0482e9fc to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import urllib2
import os
import re
from collections import deque
from bs4 import BeautifulSoup
# URL of the first listing page of the Edlab blog; fetched as-is for page 0.
base_url = "http://edlab.tc.columbia.edu/index.php?q=blog"
# Query-string fragment appended to base_url for every page after the first;
# the {} placeholder is filled with the page index via str.format.
url_suffix = "&page={}"
def html_generator(last_page=50):
    """Yield the HTML of each Edlab blog listing page, in page order.

    Page 0 is requested from ``base_url`` itself; every later page ``n`` is
    requested from ``base_url + url_suffix.format(n)``. Newline characters
    are removed from each document before it is yielded.

    Args:
        last_page: Index of the final page to fetch (inclusive). The default
            of 50 scrapes pages 0 through 50. A negative value yields
            nothing.

    Yields:
        The content returned by ``response.read()`` for each page, with all
        newline characters stripped.

    Errors raised by ``urllib2.urlopen`` or ``read`` are not caught here and
    propagate to the consumer of the generator.
    """
    for page in range(last_page + 1):
        # The first page has no "&page=" parameter; later pages do.
        url = base_url if page == 0 else base_url + url_suffix.format(page)
        response = urllib2.urlopen(url)
        try:
            html = response.read().replace('\n', '')
        finally:
            # Release the connection before handing control to the consumer,
            # so it is not held open while the page is being processed.
            response.close()
        yield html
def soup_generator():
    """Lazily turn every page produced by html_generator() into a soup.

    Yields:
        One BeautifulSoup document per HTML page, built with the
        'html.parser' backend, in the same order the pages are produced.
    """
    pages = html_generator()
    for markup in pages:
        yield BeautifulSoup(markup, 'html.parser')
def main(soups=None):
    """Tally and print the total number of comments received per author.

    Every post (``div.node``) on every page is inspected: the author's name
    is read from the post's ``<small><a>`` link and the comment count from
    the text of the first ``li.comment_comments > a`` link. Counts are summed
    per author and one line per author is printed in the form
    ``<author> | <total> comments``.

    Args:
        soups: Optional iterable of parsed pages. Each page must provide
            ``select(css)`` returning post nodes, and each node must provide
            ``small``, and ``select(css)`` the way BeautifulSoup tags do.
            When omitted, the pages come from ``soup_generator()``.

    Returns:
        None. The result is written to standard output.
    """
    if soups is None:
        soups = soup_generator()

    # Capture the whole run of digits: a single '\d' would truncate
    # "12 comments" to 1 and under-count every post with 10+ comments.
    count_pattern = re.compile(r'\d+')
    post_info = {}

    # Loop through every soup (page)
    for soup in soups:
        # Loop through every node (post)
        for node in soup.select('div.node'):
            small = node.small
            author_link = small.a if small is not None else None
            if author_link is None:
                # No author link on this node, so there is nobody to credit;
                # skip it instead of failing with AttributeError.
                continue
            username = author_link.text

            num_comment = 0
            comment_links = node.select('li.comment_comments > a')
            if comment_links:
                # search() rather than match() so leading whitespace in the
                # link text does not hide the number.
                found = count_pattern.search(comment_links[0].text)
                if found:
                    num_comment = int(found.group())

            # Add this post's comments on top of the author's running total.
            post_info[username] = post_info.get(username, 0) + num_comment

    for username, total in post_info.items():
        print('{0} | {1} comments'.format(username, total))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment