Created
July 31, 2008 00:57
-
-
Save jeremyBanks/3367 to your computer and use it in GitHub Desktop.
[2010-01] reddit fib thread crawler, probably broken now
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Because I was unable to find the root of the Fibonacci sequence post
# chain in the original story comments (it was nowhere to be found!) I made
# this little hacky script to go back, post by post, until it reaches a post
# with no parent. For fun, it displays the users and values as it goes.
# The value isn't always accurate, but that doesn't really matter. | |
# The result of this (which didn't take very long) was that it stopped, not
# very far back, because someone deleted a post. Huh. Irritating.
# http://www.reddit.com/comments/2mg72/vote_up_if_you_love_pie/c02bfe6 | |
def main(start_id="c04vmt1", fetch=None, retry_delay=2):
    """Walk a reddit comment chain upward, parent by parent, until it ends.

    Starting at ``start_id``, each comment's permalink page is fetched and
    scraped for the author, the comment body and the parent comment's ID.
    One line per comment is printed (author, comment ID, and the digits of
    the body shown in scientific notation), then the walk moves on to the
    parent.  The walk stops at the first page on which the author/body/
    parent pattern cannot be found; that comment is reported as the root.

    Args:
        start_id: ID of the comment to start from.
        fetch: Optional callable ``fetch(url)`` returning the page as text
            or bytes.  Defaults to ``urlopen(url).read()`` (``urllib2`` on
            Python 2, ``urllib.request`` on Python 3).  Any ``Exception``
            it raises is treated as a transient failure and retried.
        retry_delay: Seconds to wait before retrying a failed fetch.

    Returns:
        None, so that ``sys.exit(main())`` exits with status 0.
    """
    import re
    import time

    if fetch is None:
        try:
            from urllib2 import urlopen  # Python 2
        except ImportError:
            from urllib.request import urlopen  # Python 3

        def fetch(url):
            return urlopen(url).read()

    link_template = "http://www.reddit.com/comments/2mg72/vote_up_if_you_love_pie/%s"
    current = start_id

    while True:
        url = link_template % current

        # Retry indefinitely on fetch errors.  ``except Exception`` (rather
        # than a bare ``except``) keeps Ctrl-C able to stop the loop.
        while True:
            try:
                page = fetch(url)
            except Exception:
                print("urlopen() failed, retrying in %ss." % retry_delay)
                time.sleep(retry_delay)
            else:
                break

        # On Python 3 urlopen().read() yields bytes, which a str pattern
        # cannot search.  On Python 2 bytes is str, so this is a no-op.
        if isinstance(page, bytes) and not isinstance(page, str):
            page = page.decode("utf-8", "replace")

        # Escape the ID so it is always matched literally.
        cid = re.escape(current)
        pattern = re.compile("""<a id="author_[^"]*%s".*?>(.*?)</a>.*?<div id="body_[^"]*%s".*?><div class="md">(.*?)</div>.*?<a id="parent_[^"]*%s"[^>]*? href="[^"]*/([0-9a-zA-Z]+)".""" % (cid, cid, cid), re.DOTALL)
        match = pattern.search(page)
        if match is None:
            # No author/body/parent triple on this page: end of the chain.
            break
        user, post, parent_id = match.groups()

        # The value is display-only, so a post without digits (ValueError)
        # or one too large for a float (OverflowError) must not stop the walk.
        try:
            value = "%.1e" % int(re.sub("[^0-9]", "", post))
        except (ValueError, OverflowError):
            value = "?"

        print("%24s@%s: %s" % (user, current, value))
        current = parent_id

    print("Root post appears to be %s." % current)
if __name__ == "__main__":
    # Imports made inside main() are local to that function, so sys has to
    # be imported here before sys.exit can be used at module level.
    import sys
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment