Skip to content

Instantly share code, notes, and snippets.

@hktechn0
Created December 11, 2010 13:18
Show Gist options
  • Save hktechn0/737366 to your computer and use it in GitHub Desktop.
Save hktechn0/737366 to your computer and use it in GitHub Desktop.
Twitpic HTML Parser
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import HTMLParser
import urllib2
import time
class TwitpicParser(HTMLParser.HTMLParser):
    """Collect photo ids from a Twitpic user's photo listing pages.

    The listing is fetched page by page from ``baseurl + username``.  On
    each page, the first ``<a>`` following a ``<div class="user-photo">``
    is taken as a photo link; its ``href`` minus the leading character
    (the ``/``) is stored as the photo id.

    Attributes:
        baseurl: URL prefix of the per-user photo listing.
        username: account name appended to ``baseurl``.
        ids: photo ids in the order they were first seen, without
            duplicates.
    """

    baseurl = "http://twitpic.com/photos/"

    def __init__(self, username):
        """Create a parser for the photo listing of *username*."""
        # Explicit base-class call (not super()): on Python 2 HTMLParser is
        # an old-style class.
        HTMLParser.HTMLParser.__init__(self)
        self.username = username
        # True while we are between a "user-photo" div and its first anchor.
        self._phototag = False
        # Set once an already-known photo id is seen again.
        self._finish = False
        self.ids = []

    def start(self, pagemax=None):
        """Fetch listing pages and return the list of collected photo ids.

        Args:
            pagemax: maximum number of pages to fetch, or ``None`` for no
                limit.  Values below 1 fetch nothing.

        Returns:
            ``self.ids`` (the same list object that is kept on the parser).

        Crawling stops when the page limit is reached, when a page repeats
        an id that was already collected, or when a page contributes no new
        ids at all (otherwise an empty listing would loop forever).
        """
        page = 1
        while pagemax is None or page <= pagemax:
            if page > 1:
                # Pause between consecutive requests, as the original did.
                time.sleep(1)

            known = len(self.ids)
            # reset() does not know about our own flag, so clear it here to
            # keep a dangling "user-photo" div from leaking into this page.
            self._phototag = False
            self.feed(self._fetch_page(page))
            self.reset()

            if self._finish or len(self.ids) == known:
                break
            page += 1
        return self.ids

    def _fetch_page(self, page):
        """Download listing page number *page* and return its raw body."""
        url = self.baseurl + self.username + "?page=%d" % page
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

    def handle_starttag(self, tag, attrs):
        """Track "user-photo" divs and record the id of the anchor inside."""
        if tag == "div" and ("class", "user-photo") in attrs:
            self._phototag = True
            return
        if not (self._phototag and tag == "a"):
            return

        # Only the first anchor after the div is considered.
        self._phototag = False
        href = dict(attrs).get("href")
        if not href:
            # Anchor without a usable href: nothing to record.
            return

        photoid = href[1:]  # drop the leading "/"
        if photoid in self.ids:
            # Seeing a known id again means the listing has wrapped around.
            self._finish = True
        else:
            self.ids.append(photoid)
if __name__ == "__main__":
    import sys

    # Take the Twitpic account name from the command line; fall back to the
    # original placeholder so running without arguments behaves as before.
    username = sys.argv[1] if len(sys.argv) > 1 else "yourname"
    p = TwitpicParser(username)
    ids = p.start()
    # Single-argument parenthesised print produces the same output as the
    # Python 2 print statement.
    print("Total: %d" % len(ids))
    for i in ids:
        print(i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment