Skip to content

Instantly share code, notes, and snippets.

@immuntasir
Created October 3, 2020 13:49
Show Gist options
  • Save immuntasir/6c2caab78ba85ac15b05ea49b0f420d8 to your computer and use it in GitHub Desktop.
Save immuntasir/6c2caab78ba85ac15b05ea49b0f420d8 to your computer and use it in GitHub Desktop.
Downloading GitHub repositories
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from tqdm import tqdm\n",
"import pandas as pd\n",
"from github import Github\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Read the GitHub API key from a local text file.\n",
"with open('../../api_keys/github.txt', \"r\") as f:\n",
"    # Strip surrounding whitespace: a trailing newline in the key file would\n",
"    # otherwise become part of the credential handed to the client.\n",
"    API_KEY = f.read().strip()\n",
"\n",
"git_client = Github(API_KEY)\n",
"\n",
"# Extensions (without the leading dot) of the files to download:\n",
"# Jupyter notebooks and Python sources.\n",
"ext_set = {'ipynb', 'py'}\n",
"# Parent directory under which one sub-directory per repository is created.\n",
"REPO_DIR_PARENT = '../../data/package_popularity/numpy/clones/'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Load the repository metadata and keep only repos with at least 100 stargazers.\n",
"repo_df = pd.read_csv('../../data/package_popularity/numpy/repo_info.csv')\n",
"repo_df = repo_df.loc[repo_df['stargazers_count'].ge(100)]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def get_relevant_files(full_name, git_client, ext_set):\n",
"    \"\"\"Collect the files of a GitHub repository that have a wanted extension.\n",
"\n",
"    The repository tree is walked breadth-first, starting at the root, with\n",
"    one ``get_contents`` call per directory.\n",
"\n",
"    Args:\n",
"        full_name: Repository identifier passed to ``git_client.get_repo``\n",
"            (presumably of the form ``'owner/repo'``).\n",
"        git_client: Client object providing ``get_repo``.\n",
"        ext_set: Collection of file extensions, without the leading dot.\n",
"\n",
"    Returns:\n",
"        A list of ``(file name, download URL)`` tuples, one per matching file.\n",
"    \"\"\"\n",
"    from collections import deque\n",
"\n",
"    repo = git_client.get_repo(full_name)\n",
"    # A deque removes from the front in O(1); list.pop(0) is O(n) per call.\n",
"    pending = deque(repo.get_contents(\"\"))\n",
"    files = []\n",
"\n",
"    while pending:\n",
"        entry = pending.popleft()\n",
"        if entry.type == \"dir\":\n",
"            pending.extend(repo.get_contents(entry.path))\n",
"            continue\n",
"\n",
"        # Require a real '.' so that an extension-less file named e.g. 'py'\n",
"        # is not mistaken for a match.\n",
"        _, dot, extension = entry.name.rpartition('.')\n",
"        # Entries without a download URL (e.g. submodule entries) cannot be\n",
"        # fetched, so they are left out.\n",
"        if dot and extension in ext_set and entry.download_url:\n",
"            files.append((entry.name, entry.download_url))\n",
"\n",
"    return files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download every matching file of each selected repository into its own\n",
"# directory under REPO_DIR_PARENT. Repositories whose directory already\n",
"# exists are skipped.\n",
"for name, full_name in tqdm(repo_df[['name', 'full_name']].values):\n",
"    repo_dir = os.path.join(REPO_DIR_PARENT, '_'.join(full_name.split('/')))\n",
"    if os.path.exists(repo_dir):\n",
"        # An existing directory marks the repository as already processed.\n",
"        continue\n",
"\n",
"    try:\n",
"        # List the files before creating the directory, so that a failed\n",
"        # listing does not leave an empty directory behind that would make\n",
"        # later runs skip this repository.\n",
"        files = get_relevant_files(full_name, git_client, ext_set)\n",
"        os.mkdir(repo_dir)\n",
"\n",
"        # NOTE: files are stored flat under their base name, so same-named\n",
"        # files from different directories of a repository overwrite each other.\n",
"        for file_name, download_url in files:\n",
"            r = requests.get(download_url, allow_redirects=True, timeout=30)\n",
"            # Do not save an HTTP error page as if it were source code.\n",
"            r.raise_for_status()\n",
"            with open(os.path.join(repo_dir, file_name), 'wb') as out_file:\n",
"                out_file.write(r.content)\n",
"\n",
"    except Exception as ex:\n",
"        # Best-effort crawl: report the failing repository and carry on.\n",
"        print('Exception', name, full_name, ex)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment