Last active
August 29, 2015 13:58
-
-
Save sangheestyle/9980418 to your computer and use it in GitHub Desktop.
Fetch repository information from GitHub based on a search query
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import json | |
import sys | |
from multiprocessing import Pool | |
from subprocess import call, check_output | |
from datetime import datetime, date, timedelta | |
from time import sleep | |
import github3 as github | |
""" | |
Warning: | |
You might need to check the following variables before running this script.
base_query: query for github search, http://goo.gl/fp9ELC | |
login: your github id | |
password: your github password | |
request_delay_time: rate-limit, https://developer.github.com/v3/search/#rate-limit | |
""" | |
# Search qualifiers sent to the GitHub repository search; a " created:<date>"
# qualifier is appended per day by get_range_search_result().
base_query = "android in:AndroidManifest.xml language:java"
# Credentials passed to github.login().  They can be supplied through the
# GITHUB_LOGIN / GITHUB_PASSWORD environment variables so that real secrets
# never have to be written into this file; the placeholders below are used
# when the variables are not set.
login = os.environ.get("GITHUB_LOGIN", "YOUR_GITHUB_ID")
password = os.environ.get("GITHUB_PASSWORD", "YOUR_GITHUB_PASSWORD")
# Seconds to sleep after each search request (see the rate-limit link above).
request_delay_time = 5
def get_dates(start_date, end_date):
    """Return every day between two dates as 'YYYY-MM-DD' strings.

    Args:
        start_date: first day, formatted as 'YYYY-MM-DD'.
        end_date: last day (inclusive), formatted as 'YYYY-MM-DD'.

    Returns:
        A list of 'YYYY-MM-DD' strings in ascending order.  The list is
        empty when end_date is earlier than start_date.

    Raises:
        ValueError: if either argument does not match '%Y-%m-%d'.
    """
    date_format = '%Y-%m-%d'
    first_day = datetime.strptime(start_date, date_format)
    last_day = datetime.strptime(end_date, date_format)
    day_count = (last_day - first_day).days + 1
    # str(datetime) starts with the ISO date; the first ten characters are
    # exactly 'YYYY-MM-DD'.
    return [
        str(first_day + timedelta(days=offset))[:10]
        for offset in range(day_count)
    ]
def get_search_result(query):
    """Run one GitHub repository search and collect the hits.

    A new session is opened with the module-level `login` / `password` on
    every call.

    Args:
        query: the search query string handed to `search_repositories`.

    Returns:
        A list holding `hit.repository.to_json()` for each search hit, in the
        order the search yields them.
    """
    session = github.login(login, password)
    return [
        hit.repository.to_json()
        for hit in session.search_repositories(query)
    ]
def get_range_search_result(start_date, end_date, base_query):
    """Search GitHub once per day in a date range and merge the results.

    For every day from start_date to end_date (inclusive) the qualifier
    " created:<day>" is appended to base_query and the search is executed.
    Progress is printed before and after each request.

    Args:
        start_date: first day, formatted as 'YYYY-MM-DD'.
        end_date: last day (inclusive), formatted as 'YYYY-MM-DD'.
        base_query: search query shared by all requests.

    Returns:
        One flat list with the results of all per-day searches.
    """
    range_search_result = []
    for the_date in get_dates(start_date, end_date):
        query = base_query + " created:" + the_date
        # Single-argument print(...) with %-formatting emits the same text on
        # Python 2 and Python 3 (the old print statement is Python 2 only).
        print(">>> doing query: %s" % query)
        search_result = get_search_result(query)
        range_search_result.extend(search_result)
        print("<<< done: current: %d total: %d"
              % (len(search_result), len(range_search_result)))
        # Pause after each request to stay under the search rate limit.
        sleep(request_delay_time)
    return range_search_result
def write_json_to_file(json_data, file_name):
    """Serialize json_data as JSON into file_name, overwriting the file.

    Args:
        json_data: any object `json.dump` can serialize.
        file_name: path of the file to create or overwrite.
    """
    # Text mode: json.dump writes str objects, which a binary-mode ("wb")
    # file rejects with TypeError on Python 3.
    with open(file_name, "w") as fp:
        json.dump(json_data, fp)
def clone_repo(clone_url, full_name):
    """Run `git clone <clone_url> <full_name>` and wait for it to finish.

    Args:
        clone_url: URL handed to `git clone`.
        full_name: destination directory handed to `git clone`.

    Returns:
        None; git's exit status is not reported to the caller.
    """
    git_clone = ["git", "clone", clone_url, full_name]
    call(git_clone)
def clone_repo_helper(args):
    """Call clone_repo with a packed argument sequence.

    Pool.map passes exactly one argument to its worker function, so the
    (clone_url, full_name) pair arrives as a single sequence and is unpacked
    here.

    Args:
        args: sequence whose items are the positional arguments of
            clone_repo.

    Returns:
        Whatever clone_repo returns.
    """
    outcome = clone_repo(*args)
    return outcome
def clone_multiple_repo(clone_url_full_name_pair_list, number_of_processes):
    """Clone several repositories in parallel worker processes.

    Args:
        clone_url_full_name_pair_list: iterable of (clone_url, full_name)
            pairs, one per repository to clone.
        number_of_processes: size of the worker pool.
    """
    pool = Pool(number_of_processes)
    try:
        pool.map(clone_repo_helper, clone_url_full_name_pair_list)
    finally:
        # Shut the workers down even when a clone raised; otherwise the pool
        # processes stay alive until the interpreter exits.
        pool.close()
        pool.join()
def get_json_value(json_contents, *fields):
    """Project each record onto the requested fields.

    Args:
        json_contents: iterable of mappings (for example decoded JSON
            objects).
        *fields: keys to look up in every record, in output order.

    Returns:
        A list with one inner list per record, holding that record's values
        for `fields` in the order given.

    Raises:
        KeyError: if a record lacks one of the fields.
    """
    return [[record[field] for field in fields] for record in json_contents]
def read_json_file(file_path):
    """Load and return the JSON document stored at file_path.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        The decoded JSON value.
    """
    with open(file_path, 'r') as source:
        return json.load(source)
def write_list_to_text(the_list, file_name):
    """Write every item of the_list to file_name, one per line.

    Each item is formatted with "%s" and terminated with a newline; an
    existing file is overwritten.

    Args:
        the_list: iterable of items to write.
        file_name: path of the file to create or overwrite.
    """
    # Text mode: the formatted lines are str, which a binary-mode ("wb")
    # file rejects with TypeError on Python 3.
    with open(file_name, "w") as fp:
        for item in the_list:
            fp.write("%s\n" % item)
def read_repo_list(path):
    """Return the lines of the text file at path, without line endings.

    Args:
        path: path of the text file to read.

    Returns:
        A list with one string per line of the file.
    """
    with open(path, "r") as source:
        content = source.read()
    return content.splitlines()
def run_git_command(path, git_command):
    """Run a git command inside the repository at path.

    The repository's top-level directory (as reported by
    `git rev-parse --show-toplevel`) is printed after ">>>", then the command
    runs, then "<<<" is printed.

    Args:
        path: directory inside the git repository to operate on.
        git_command: the full command line as one string, e.g. "git status".
            It is split on whitespace, so quoted arguments containing spaces
            are not supported.

    Raises:
        CalledProcessError: if `git rev-parse` fails (for example when path
            is not inside a git repository).
        OSError: if path does not exist or git cannot be started.
    """
    argv = git_command.split()
    # cwd= runs git in `path` without changing this process's working
    # directory, so nothing has to be restored when git fails.
    # universal_newlines=True yields text instead of bytes on Python 3.
    toplevel = check_output(["git", "rev-parse", "--show-toplevel"],
                            cwd=path, universal_newlines=True)
    print(">>> %s" % toplevel)
    call(argv, cwd=path)
    print("<<<")
if __name__ == '__main__':
    # Usage: <script> REPO_JSON_FILE
    #
    # Clones every repository listed in REPO_JSON_FILE (a JSON array of
    # objects with "clone_url" and "full_name" keys) into ./repo, using four
    # worker processes.
    #
    # The search step that produces such a file is currently disabled:
    #     start_date = sys.argv[1]
    #     end_date = sys.argv[2]
    #     file_name = sys.argv[3]
    #     result = get_range_search_result(start_date, end_date, base_query)
    #     write_json_to_file(result, file_name)
    if len(sys.argv) < 2:
        sys.exit("usage: %s REPO_JSON_FILE" % sys.argv[0])
    json_contents = read_json_file(sys.argv[1])
    value_list = get_json_value(json_contents, "clone_url", "full_name")
    output_path = "repo"
    current_dir = os.getcwd()
    if not os.path.isdir(output_path):
        os.mkdir(output_path)
    # The clones use relative destination paths, so run them from inside the
    # output directory and always return to the starting directory.
    os.chdir(output_path)
    try:
        print(os.getcwd())
        clone_multiple_repo(value_list, 4)
    finally:
        os.chdir(current_dir)
Todo
Split the code by purpose.
- search github repos
- store it into text file
- clone repos
- analyze (b)
- analyze repos
import os
from subprocess import check_output, CalledProcessError
def get_repo_paths(root=None):
    """Find the working directories of all git repositories below root.

    A directory counts as a repository when it directly contains a `.git`
    directory.  Repositories nested inside another repository's working tree
    are reported as well.

    Args:
        root: directory to search.  None (the default) means the current
            directory.

    Returns:
        A list of repository paths, each prefixed with root.
    """
    if root is None:
        # os.walk(None) raises TypeError; search the current directory.
        root = os.curdir
    repo_paths = []
    for dirpath, dirnames, _ in os.walk(root):
        if '.git' in dirnames:
            repo_paths.append(dirpath)
            # Do not descend into git's own metadata; it holds no
            # repositories and can be large.
            dirnames.remove('.git')
    return repo_paths
def get_commit_ids_by_permission(repo_path=None, permission=None):
    """List commits that touched a uses-permission line in a manifest.

    Runs `git log -G <regex> -- */AndroidManifest.xml` in repo_path and
    returns the abbreviated hashes it prints.

    Args:
        repo_path: directory of the git repository.  None means the current
            working directory.
        permission: regular-expression fragment matched after the last "."
            of the permission name (it is inserted into the pattern
            unescaped).  None matches any permission.

    Returns:
        A list of abbreviated commit hashes; empty when nothing matches or
        when git exits with an error.
    """
    if permission is None:
        permission = '.*'
    permission_rex = r"<.*uses-permission.*\." + permission + ".*/>"
    # NOTE(review): the pathspec requires at least one "/" before the file
    # name, so a manifest in the repository root is presumably not
    # considered -- confirm whether that is intended.
    command = [
        "git", "log", "--pretty=format:%h",
        "-G", permission_rex,
        "--", "*/AndroidManifest.xml",
    ]
    try:
        # One guarded call; cwd= avoids changing (and having to restore) the
        # process-wide working directory, and universal_newlines=True yields
        # text rather than bytes on Python 3.
        output = check_output(command, cwd=repo_path,
                              universal_newlines=True)
    except CalledProcessError as err:
        print("git log failed in %s (exit status %d)"
              % (repo_path, err.returncode))
        return []
    return output.split()
# Report, for every repository below `root`, how many commits touched a
# uses-permission line in an AndroidManifest.xml.
root = '../repo'
for repo_path in get_repo_paths(root):
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(">>> %s" % repo_path)
    print(len(get_commit_ids_by_permission(repo_path)))
    # NOTE(review): the extracted source lost its indentation; "!!" is
    # assumed to close each repository's output -- confirm against the
    # original script.
    print("!!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Use code such as the following to clone the repos.
Also do it with the following code.