Created
December 9, 2012 13:31
-
-
Save sanand0/4244899 to your computer and use it in GitHub Desktop.
Scrapes pypi.python.org modules into a CSV file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.csv | |
.cache |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Scrape <http://pypi.python.org/> into a CSV file listing | |
- updated date | |
- package size | |
- downloads | |
By default, it scrapes the Scientific packages. | |
""" | |
import os | |
import sys | |
import csv | |
import urllib | |
import hashlib | |
import lxml.html | |
# Downloaded pages are stored under ./.cache so that repeated runs can reuse
# them; create that directory the first time the script runs.
_cache_dir_present = os.path.exists('.cache')
if not _cache_dir_present:
    os.makedirs('.cache')
def get(url):
    """Return the parsed lxml tree for ``url``, downloading it at most once.

    The page is stored under ``.cache/`` in a file named after the SHA-1 hex
    digest of the URL. If that file already exists it is parsed directly and
    no download takes place.

    Works on both Python 2 and Python 3.
    """
    # urlretrieve moved to urllib.request in Python 3; import it locally so
    # the module-level imports of the script stay untouched.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # hashlib requires bytes on Python 3. A Python 2 ``str`` is already bytes,
    # so only text URLs are encoded.
    key = url if isinstance(url, bytes) else url.encode('utf-8')
    filename = '.cache/' + hashlib.sha1(key).hexdigest()
    if not os.path.exists(filename):
        # Download to a temporary name and rename on success, so that an
        # interrupted transfer never leaves a truncated file that would be
        # picked up as a valid cache entry on the next run.
        partial = filename + '.part'
        urlretrieve(url, partial)
        os.rename(partial, filename)
    return lxml.html.parse(filename)
# Listing page whose packages are scraped.
URL = 'http://pypi.python.org/pypi?:action=browse&show=all&c=385'  # Scientific packages
# URL = 'http://pypi.python.org/pypi?:action=index'  # All packages

tree = get(URL)

# The first and last rows of the listing table are skipped (presumably header
# and footer rows); the first link found in each remaining row is the package.
packages = []
for tr in tree.findall('.//table[@class="list"]//tr')[1:-1]:
    packages.append(tr.find('.//a'))

out = csv.writer(sys.stdout, lineterminator='\n')

for package in packages:
    href = package.get('href')
    tree = get('http://pypi.python.org' + href)
    rows = tree.findall('.//table[@class="list"]//tr')[1:-1]

    # One (updated, size, downloads) record per table row, read from the
    # last three cells of the row.
    records = []
    for row in rows:
        cells = row.findall('.//td')
        when = cells[-3].text
        # Turn text such as "12KB" or "3MB" into a number by replacing the
        # unit suffix with decimal zeros. "MB" and "KB" must be handled
        # before the bare "B".
        size_text = cells[-2].text
        for unit, zeros in (('MB', '000000'), ('KB', '000'), ('B', '')):
            size_text = size_text.replace(unit, zeros)
        records.append((when, int(size_text), int(cells[-1].text)))

    # Pages without any data rows produce no CSV line.
    if not records:
        continue

    updated, size, downloads = zip(*records)
    # CSV columns: package link, largest "updated" string, largest size,
    # largest download count.
    out.writerow([href, max(updated), max(size), max(downloads)])
    sys.stdout.flush()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment