@cshaley
Last active August 31, 2017 14:02
List all conda-forge packages and package files hosted on anaconda.org
# Goal: list all conda-forge packages and the package files
# hosted on anaconda.org.
# This program takes about 5 minutes to run locally:
#   ~75% of the time is URL fetching,
#   ~25% is pandas read_html (this could be cut significantly with custom parsing).
import time

import pandas as pd
import requests
from joblib import Parallel, delayed


def get_pkg_list_page(page):
    """Get the table of conda-forge packages from one page of the package list."""
    package_list_page = "https://anaconda.org/conda-forge/repo"
    return pd.read_html(package_list_page + "?page=" + str(page))[0]


def get_package_file_list(pkg):
    """For one package, get the table of files hosted on anaconda.org."""
    # Start by getting the first page and the total number of pages.
    base_url = "https://anaconda.org/conda-forge/"
    file_list_page = base_url + pkg + "/files"
    flp = requests.get(file_list_page)
    # The page count is scraped from the pager's "1 of N" text; this is
    # brittle and will break if anaconda.org changes its markup.
    num_pages = int(flp.text.split('1 of ')[1].split('</a>')[0])
    # Collect all files associated with the package into a list of
    # dataframes, one dataframe per page of files on anaconda.org.
    dfs = [pd.read_html(flp.text)[0]]
    for page in range(2, num_pages + 1):
        flp = requests.get(file_list_page + "?page=" + str(page))
        dfs.append(pd.read_html(flp.text)[0])
    # Concatenate, so each package ends up with one dataframe of files.
    df = pd.concat(dfs).reset_index(drop=True)
    df["Package Name"] = pkg
    return df


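# A sketch of a more defensive page-count parser (illustrative only; the
# script does not use it). It assumes the anaconda.org pager markup contains
# text like "1 of 12</a>", the same pattern the string splits above rely on,
# and falls back to a single page when no pager is found.
import re


def count_pages(html):
    """Return the pager's page count from raw HTML, defaulting to 1."""
    match = re.search(r'1 of (\d+)</a>', html)
    return int(match.group(1)) if match else 1

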
if __name__ == "__main__":
    ## BEGIN PACKAGE LISTING
    start_time = time.time()
    # Start by getting the first page and the total number of pages
    # (the same brittle pager scrape as in get_package_file_list).
    package_list_page = "https://anaconda.org/conda-forge/repo"
    plp = requests.get(package_list_page)
    num_pages = int(plp.text.split('1 of ')[1].split('</a>')[0])
    # Fetch the remaining package-list pages in parallel and combine them.
    dfs = [pd.read_html(plp.text)[0]]
    dfs1 = Parallel(n_jobs=8)(delayed(get_pkg_list_page)(page)
                              for page in range(2, num_pages + 1))
    dfs = dfs + dfs1
    pkg_list = pd.concat(dfs).reset_index(drop=True)
    pkg_list = pkg_list.rename(columns=lambda x: x.strip())
    print("Package list time, in seconds: ", time.time() - start_time)
    # Dump to CSV.
    pkg_list.to_csv('pkg_list.csv', encoding='utf-8')
    ## END PACKAGE LISTING

    ## BEGIN FILE LISTING
    start_time = time.time()
    # For each package, download the list of files (in parallel as well).
    dfs = Parallel(n_jobs=8)(delayed(get_package_file_list)(pkg)
                             for pkg in pkg_list['Package Name'].values)
    file_list = pd.concat(dfs).drop("Unnamed: 0", axis=1).reset_index(drop=True)
    print("File list time, in seconds: ", time.time() - start_time)
    file_list.to_csv('file_list.csv', encoding='utf-8')
    ## END FILE LISTING
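
    ## USAGE SKETCH (illustrative; assumes the two listings above completed):
    # summarize the scraped table, e.g. number of files hosted per package.
    files_per_pkg = (file_list.groupby("Package Name")
                              .size()
                              .sort_values(ascending=False))
    print(files_per_pkg.head(10))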