Skip to content

Instantly share code, notes, and snippets.

@nmaier
Created January 30, 2012 23:32
Show Gist options
  • Save nmaier/1707509 to your computer and use it in GitHub Desktop.
Save nmaier/1707509 to your computer and use it in GitHub Desktop.
import re
from urllib2 import urlopen, Request, URLError
from html5lib import parse as html5
# Review-log listing URL template; "{0}" is replaced with the page number
# via url.format(page) in parse().
url = (
    "https://addons.mozilla.org/en-US/editors/reviewlog"
    "?start=2011-12-22&end=&page={0}"
)

# HTTP request headers sent with every review-log request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:12.0a1) "
        "Gecko/20120117 Firefox/12.0a1"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,"
        "application/xml;q=0.9,*/*;q=0.8"
    ),
    "Accept-Language": "de-de,de;q=0.8,en-us;q=0.5,en;q=0.3",
    "Referer": "https://addons.mozilla.org/en-US/editors/reviewlog",
    # Placeholder value; presumably has to be replaced with a real session
    # cookie before the script is run -- confirm.
    "Cookie": "<set me plox>",
}
def saniint(t):
    """Return *t* with every non-digit character removed.

    The result is still a string (for example ``" 1,234 "`` becomes
    ``"1234"``); no conversion to ``int`` is performed.
    """
    trimmed = t.strip()
    digits_only = re.sub(r'\D', '', trimmed)
    return digits_only
def parse(page):
    """Fetch one page of the review log and extract its review rows.

    ``page`` is substituted into the module-level ``url`` template and the
    request is sent with the module-level ``headers``.

    Returns a ``(cur, total, reviews)`` tuple:

    * ``cur`` / ``total`` -- digit-only strings taken from the second and
      third ``<strong>`` elements matched by the pagination XPath
      (presumably "last entry shown" and "total entries" -- confirm against
      the page markup).
    * ``reviews`` -- stripped text of the third cell of every
      ``#log-listing`` row whose link text contains "prelimi", "approved"
      or "rejected".

    Returns ``None`` (after printing the error) when the request fails with
    ``URLError``.  An ``IndexError`` propagates if the pagination elements
    are not found in the page.
    """
    request = Request(url.format(page), headers=headers)
    try:
        response = urlopen(request)
        try:
            content = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
    except URLError as ex:
        print(ex)
        return None

    dom = html5(content, treebuilder="lxml", namespaceHTMLElements=False)

    # The pagination counters are located purely by position in the markup.
    pager_xpath = r"//body/div/div/div/div/div/strong[{0}]"
    cur = saniint(dom.xpath(pager_xpath.format(2))[0].text)
    total = saniint(dom.xpath(pager_xpath.format(3))[0].text)

    # Only rows whose action link mentions a finished review are counted.
    reviews_xpath = (
        '//*[@id = "log-listing"]/tbody/tr['
        'td/a[contains(text(), "prelimi") or contains(text(), "approved")'
        ' or contains(text(), "rejected")]'
        ']/td[3]'
    )
    reviews = [cell.text.strip() for cell in dom.xpath(reviews_xpath)]
    return cur, total, reviews
# Number of matching review-log rows per name (third table cell), summed
# over all fetched pages.
reviewers = {}
i = 0
while True:
    i += 1
    result = parse(i)
    if result is None:
        # parse() returns None when the page could not be fetched (it has
        # already printed the error); unpacking None would only raise an
        # unhelpful TypeError, so stop with a clear message instead.
        raise SystemExit("Could not fetch review log page %d; aborting" % i)
    cur, total, reviews = result
    print("%s %s %s" % (i, cur, total))
    for r in reviews:
        reviewers[r] = reviewers.get(r, 0) + 1
    # The last page is reached once the two pagination counters agree.
    if cur == total:
        break

# Sort ascending by count, then walk the list backwards so the highest
# counts are printed first.
reviewers = sorted(
    ((k.encode("utf-8"), v) for (k, v) in reviewers.items()),
    key=lambda x: x[1]
)
for k, v in reversed(reviewers):
    print("%s %s" % (k, v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment