List all conda forge packages and files on anaconda.org
# Goal is to list all conda forge packages and package files
# that are hosted on anaconda.org
# This program takes about 5 minutes to run locally.
# 75% of the time is URL fetching
# 25% of the time is pandas read_html (this could be cut significantly with custom code)
import requests
import time
from joblib import Parallel, delayed
import pandas as pd


def get_pkg_list_page(page):
    """Get list of all conda forge packages on a package list page"""
    package_list_page = "https://anaconda.org/conda-forge/repo"
    return pd.read_html(package_list_page + "?page=" + str(page))[0]


def get_package_file_list(pkg):
    """For each package, get a list of files on conda forge."""
    # Start by getting the first page and the total number of pages.
    base_url = "https://anaconda.org/conda-forge/"
    file_list_page = base_url + pkg + "/files"
    flp = requests.get(file_list_page)
    # Parse the page count from the pagination text ("1 of N"); if no
    # pagination text is present, assume a single page of files.
    if '1 of ' in flp.text:
        num_pages = int(flp.text.split('1 of ')[1].split('</a>')[0])
    else:
        num_pages = 1
    # Put a list of all files associated with a package into a list of
    # dataframes, with one dataframe created per page of files on
    # anaconda.org
    dfs = [pd.read_html(flp.text)[0]]
    for page in range(2, num_pages + 1):
        flp = requests.get(file_list_page + "?page=" + str(page))
        dfs.append(pd.read_html(flp.text)[0])
    # Concat all dfs into one, so each package has one dataframe of files.
    df = pd.concat(dfs).reset_index(drop=True)
    df["Package Name"] = pkg
    return df


if __name__ == "__main__":
    ## BEGIN PACKAGE LISTING
    start_time = time.time()
    # Start by getting the first page and the number of pages.
    package_list_page = "https://anaconda.org/conda-forge/repo"
    plp = requests.get(package_list_page)
    num_pages = int(plp.text.split('1 of ')[1].split('</a>')[0])
    # Put a list of all packages together, fetching the remaining pages in parallel.
    dfs = [pd.read_html(plp.text)[0]]
    dfs1 = Parallel(n_jobs=8)(delayed(get_pkg_list_page)(page)
                              for page in range(2, num_pages + 1))
    dfs = dfs + dfs1
    pkg_list = pd.concat(dfs).reset_index(drop=True)
    pkg_list = pkg_list.rename(columns=lambda x: x.strip())
    print("Package list time, in seconds: ", time.time() - start_time)
    # Dump to CSV
    pkg_list.to_csv('pkg_list.csv', encoding='utf-8')
    ## END PACKAGE LISTING

    ## BEGIN FILE LISTING
    start_time = time.time()
    # For each package, download the list of files
    dfs = Parallel(n_jobs=8)(delayed(get_package_file_list)(pkg)
                             for pkg in pkg_list['Package Name'].values)
    file_list = pd.concat(dfs).drop("Unnamed: 0", axis=1).reset_index(drop=True)
    print("File list time, in seconds: ", time.time() - start_time)
    file_list.to_csv('file_list.csv', encoding='utf-8')
    ## END FILE LISTING
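
Once the script has run, the two CSVs can be reloaded and summarized with pandas. This is a minimal sketch, not part of the original gist; it assumes the default output file names used above (pkg_list.csv, file_list.csv) and the "Package Name" column added in get_package_file_list.

import pandas as pd

# Reload the CSVs written by the script above (to_csv includes the index column).
pkg_list = pd.read_csv('pkg_list.csv', index_col=0)
file_list = pd.read_csv('file_list.csv', index_col=0)

# Simple summaries: total package count, and the packages with the most
# files hosted on anaconda.org.
print(len(pkg_list), "conda-forge packages")
print(file_list.groupby("Package Name").size().nlargest(10))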