Created
July 10, 2015 14:27
-
-
Save jochasinga/2e9ca06706ee0482e9fc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import urllib2 | |
import os | |
import re | |
from collections import deque | |
from bs4 import BeautifulSoup | |
# Landing page of the Edlab blog listing (this is also page 0).
base_url = 'http://edlab.tc.columbia.edu/index.php?q=blog'
# Query-string fragment appended to base_url for later pages; the
# placeholder is filled with the page number via str.format().
url_suffix = '&page={}'
def html_generator(max_page=50):
    """Yield the HTML of each Edlab blog listing page, in page order.

    One GET request is sent per page: the bare ``base_url`` for page 0,
    and ``base_url`` followed by ``url_suffix`` (formatted with the page
    number) for every later page.

    Args:
        max_page: Highest page number to request, inclusive. Defaults
            to 50, i.e. pages 0 through 50 are fetched. A negative
            value yields nothing.

    Yields:
        The body of each response with all newline characters removed.
    """
    for page in range(max_page + 1):
        if page == 0:
            url = base_url
        else:
            url = base_url + url_suffix.format(page)

        response = urllib2.urlopen(url)
        try:
            html = response.read().replace('\n', '')
        finally:
            # urllib2 responses are not context managers, so release the
            # connection explicitly, even when read() fails.
            response.close()

        # Yield only after closing so the connection is not held open
        # while the consumer processes the page.
        yield html
def soup_generator():
    """Yield a BeautifulSoup tree for every page that html_generator() produces.

    Each page is parsed with the 'html.parser' backend.
    """
    for page_html in html_generator():
        yield BeautifulSoup(page_html, 'html.parser')
# Leading run of digits in a comment link's text, e.g. "12 comments".
# The quantifier matters: a bare \d would read "12 comments" as 1.
_COMMENT_COUNT_RE = re.compile(r'\s*(\d+)')


def _parse_comment_count(text):
    """Return the integer at the start of *text*, or 0 if there is none."""
    found = _COMMENT_COUNT_RE.match(text)
    return int(found.group(1)) if found else 0


def main():
    """Print the total number of comments received by each blog author.

    Walks every page from soup_generator(), treats each ``div.node`` as
    one post, reads the author name from the post's ``<small><a>`` link
    and the comment count from its ``li.comment_comments > a`` link, and
    sums the counts per author. One line per author is printed in the
    form ``<author> | <total> comments``.
    """
    post_info = {}

    # Loop through every soup (page)
    for soup in soup_generator():
        # Loop through every node (post)
        for node in soup.select('div.node'):
            # Get author's name; skip nodes without a byline link rather
            # than aborting the whole crawl with an AttributeError.
            byline = node.small
            author_link = byline.a if byline is not None else None
            if author_link is None:
                continue
            username = author_link.text

            # A post with no comment link counts as zero comments.
            comment_links = node.select('li.comment_comments > a')
            if comment_links:
                num_comment = _parse_comment_count(comment_links[0].text)
            else:
                num_comment = 0

            # Add this post's comments to the author's running total.
            post_info[username] = post_info.get(username, 0) + num_comment

    for username, total in post_info.items():
        print('{0} | {1} comments'.format(username, total))
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment