List all conda forge packages and files on anaconda.org
# Goal is to list all conda forge packages and package files
# that are hosted on anaconda.org
# This program takes about 5 minutes to run locally.
# 75% of the time is URL fetching
# 25% of the time is pandas read_html (this could be cut significantly with custom code)
import requests
import time
from joblib import Parallel, delayed
import pandas as pd


def get_pkg_list_page(page):
    """Get list of all conda forge packages on a package list page"""
    package_list_page = "https://anaconda.org/conda-forge/repo"
    return pd.read_html(package_list_page + "?page=" + str(page))[0]


def get_package_file_list(pkg):
    """For each package, get a list of files on conda forge."""
    # Start by getting the first page and the total number of pages.
    base_url = "https://anaconda.org/conda-forge/"
    file_list_page = base_url + pkg + "/files"
    flp = requests.get(file_list_page)
    # Parse the page count from the pagination text ("1 of N"); if no
    # pagination text is present, assume a single page of files.
    if '1 of ' in flp.text:
        num_pages = int(flp.text.split('1 of ')[1].split('</a>')[0])
    else:
        num_pages = 1
    # Put a list of all files associated with a package into a list of
    # dataframes, with one dataframe created per page of files on
    # anaconda.org
    dfs = [pd.read_html(flp.text)[0]]
    for page in range(2, num_pages + 1):
        flp = requests.get(file_list_page + "?page=" + str(page))
        dfs.append(pd.read_html(flp.text)[0])
    # Concat all dfs into one, so each package has one dataframe of files.
    df = pd.concat(dfs).reset_index(drop=True)
    df["Package Name"] = pkg
    return df


if __name__ == "__main__":
    ## BEGIN PACKAGE LISTING
    start_time = time.time()
    # Start by getting the first page and the number of pages.
    package_list_page = "https://anaconda.org/conda-forge/repo"
    plp = requests.get(package_list_page)
    num_pages = int(plp.text.split('1 of ')[1].split('</a>')[0])
    # Put a list of all packages together, fetching the remaining pages in parallel.
    dfs = [pd.read_html(plp.text)[0]]
    dfs1 = Parallel(n_jobs=8)(delayed(get_pkg_list_page)(page)
                              for page in range(2, num_pages + 1))
    dfs = dfs + dfs1
    pkg_list = pd.concat(dfs).reset_index(drop=True)
    pkg_list = pkg_list.rename(columns=lambda x: x.strip())
    print("Package list time, in seconds: ", time.time() - start_time)
    # Dump to CSV
    pkg_list.to_csv('pkg_list.csv', encoding='utf-8')
    ## END PACKAGE LISTING

    ## BEGIN FILE LISTING
    start_time = time.time()
    # For each package, download the list of files
    dfs = Parallel(n_jobs=8)(delayed(get_package_file_list)(pkg)
                             for pkg in pkg_list['Package Name'].values)
    file_list = pd.concat(dfs).drop("Unnamed: 0", axis=1).reset_index(drop=True)
    print("File list time, in seconds: ", time.time() - start_time)
    file_list.to_csv('file_list.csv', encoding='utf-8')
    ## END FILE LISTING
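
Once the script has run, the two CSVs can be reloaded and summarized with pandas. This is a minimal sketch, not part of the original gist; it assumes the default output file names used above (pkg_list.csv, file_list.csv) and the "Package Name" column added in get_package_file_list.

import pandas as pd

# Reload the CSVs written by the script above (to_csv includes the index column).
pkg_list = pd.read_csv('pkg_list.csv', index_col=0)
file_list = pd.read_csv('file_list.csv', index_col=0)

# Simple summaries: total package count, and the packages with the most
# files hosted on anaconda.org.
print(len(pkg_list), "conda-forge packages")
print(file_list.groupby("Package Name").size().nlargest(10))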