Skip to content

Instantly share code, notes, and snippets.

@sangheestyle
Last active August 29, 2015 13:58
Show Gist options
  • Save sangheestyle/9980418 to your computer and use it in GitHub Desktop.
Save sangheestyle/9980418 to your computer and use it in GitHub Desktop.
Fetching repository information from GitHub based on a search query
import os
import json
import sys
from multiprocessing import Pool
from subprocess import call, check_output
from datetime import datetime, date, timedelta
from time import sleep
import github3 as github
"""
Warning:
You might need to check folloing variables before running this script.
base_query: query for github search, http://goo.gl/fp9ELC
login: your github id
password: your github password
request_delay_time: rate-limit, https://developer.github.com/v3/search/#rate-limit
"""
base_query = "android in:AndroidManifest.xml language:java"
login = "YOUR_GITHUB_ID"
password = "YOUR_GITHUB_PASSWORD"
request_delay_time = 5
def get_dates(start_date, end_date):
d1 = datetime.strptime(start_date, '%Y-%m-%d')
d2 = datetime.strptime(end_date, '%Y-%m-%d')
delta = d2 - d1
dates = []
for i in range(delta.days + 1):
current_date = d1 + timedelta(days=i)
dates.append(str(current_date)[:10])
return dates
def get_search_result(query):
gh = github.login(login, password)
result = gh.search_repositories(query)
search_result = []
for repository_search_result in result:
json_data = repository_search_result.repository.to_json()
search_result.append(json_data)
return search_result
def get_range_search_result(start_date, end_date, base_query):
dates = get_dates(start_date, end_date)
range_search_result = []
for the_date in dates:
query = base_query + " created:" + the_date
print ">>> doing query:", query
search_result = get_search_result(query)
range_search_result.extend(search_result)
print "<<< done:", \
"current:", len(search_result), \
"total:", len(range_search_result)
sleep(request_delay_time)
return range_search_result
def write_json_to_file(json_data, file_name):
with open(file_name, "wb") as fp:
json.dump(json_data, fp)
def clone_repo(clone_url, full_name):
call(["git", "clone", clone_url, full_name])
def clone_repo_helper(args):
return clone_repo(*args)
def clone_multiple_repo(clone_url_full_name_pair_list, number_of_processes):
p = Pool(number_of_processes)
p.map(clone_repo_helper, clone_url_full_name_pair_list)
def get_json_value(json_contents, *fields):
value_list = []
for data in json_contents:
values = []
for field in fields:
values.append(data[field])
value_list.append(values)
return value_list
def read_json_file(file_path):
with open(file_path, 'r') as fp:
json_data = json.load(fp)
return json_data
def write_list_to_text(the_list, file_name):
with open(file_name, "wb") as fp:
for item in the_list:
fp.write("%s\n" % item)
def read_repo_list(path):
with open(path, "r") as fp:
lines = fp.read().splitlines()
return lines
def run_git_command(path, git_command):
current_path = os.getcwd()
os.chdir(path)
git_command = git_command.split()
basename = check_output(["git", "rev-parse", "--show-toplevel"])
print ">>>", basename
call(git_command)
print "<<<"
os.chdir(current_path)
if __name__=='__main__':
"""
start_date = sys.argv[1]
end_date = sys.argv[2]
file_name = sys.argv[3]
result = get_range_search_result(start_date, end_date, base_query)
write_json_to_file(result, file_name)
"""
json_contents = read_json_file(sys.argv[1])
value_list = get_json_value(json_contents, "clone_url", "full_name")
output_path = "repo"
current_dir = os.getcwd()
if not os.path.isdir(output_path):
os.mkdir(output_path)
os.chdir(output_path)
print os.getcwd()
clone_multiple_repo(value_list, 4)
os.chdir(current_dir)
@sangheestyle
Copy link
Author

Usage

$ python repo.py 2010-01-01 2010-12-31 repo_2010.txt

Plan

  • Save search result into a text file
  • Present summary stat based on the text file
  • Clone repos based on 'clone_url's
  • Analyze 100 repos first, then scale the analysis up to more than 10,000

@sangheestyle
Copy link
Author

Use code such as following for cloning repos

def clone_repo(repo_info):
    current_dir = os.getcwd()
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    os.chdir(output_path)
    call(["git", "clone", repo_info["clone_url"], repo_info["full_name"]])
    os.chdir(current_dir)

also do it with following code

 if __name__=='__main__':
     p = Pool(number_of_workers)
     p.map(clone_repo, repositories)

@sangheestyle
Copy link
Author

Todo

Split the code by purpose.

  • search github repos
  • store it into text file
  • clone repos
  • analyze (b)
  • analyze repos

@sangheestyle
Copy link
Author

import os
from subprocess import check_output, CalledProcessError


def get_repo_paths(root=None):
    repo_paths = []
    for dir, _, _ in os.walk(root):
        if dir.endswith('/.git'):
            repo_paths.append(dir[:-5])
    return repo_paths


def get_commit_ids_by_permission(repo_path=None, permission=None):
    if permission is None:
        permission = '.*'

    current_path = os.getcwd()
    os.chdir(repo_path)
    template_git_command = ["git", "log", "--pretty=format:%h"]
    permission_rex = "<.*uses-permission.*\." + permission + ".*/>"
    template_permission = ["-G", permission_rex]
    template_path = ["--", "*/AndroidManifest.xml"]
    command = []
    command.extend(template_git_command)
    command.extend(template_permission)
    command.extend(template_path)
    output = check_output(command)
    try:
        output = check_output(command)
    except CalledProcessError:
        print "AAAAAAAAAAAAAAA"
        commit_ids = []
    os.chdir(current_path)
    commit_ids = output.split()
    return commit_ids

root = '../repo'
for repo_path in get_repo_paths(root):
    print ">>>", repo_path
    print len(get_commit_ids_by_permission(repo_path))
print "!!"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment