Steps:
1. Get the dataset from http://kin-y.github.io/miningReviewRepo/ (GetFileList.py reads it from a MySQL database, e.g. gm_openstack).
2. Export the file list to ./gm_openstack.csv:
   python3 GetFileList.py gm_openstack user passwd
3. Create the output directory:
   mkdir revision_files
4. Download the file diffs for the CSV rows from start to end:
   python3 RequestFileDiff.py gm_openstack https://review.openstack.org start end --from-ini
#!/usr/bin/env python3
"""
Get file list from mysql

Usage:
    $ python3 src/GetFileList.py gm_openstack user passwd

Output:
    ./gm_openstack.csv
        - "ch_id": Change id (t_change.ch_Id)
        - "ch_change_id": t_change.ch_changeId
        - "rev_id": Revision id (t_revision.rev_Id)
        - "rev_change_id": t_revision.rev_changeId
        - "f_file_name": Encoded file path (quote_plus of t_file.f_fileName)
        - "rev_patchSetNum": t_revision.rev_patchSetNum
"""
import sys
import csv
from urllib.parse import quote_plus
from collections import defaultdict

import MySQLdb
def main():
    """
    Export the (change, revision, file) rows of a review database to CSV.

    Arguments are taken from ``sys.argv``: database name, user and password.
    Unless exactly these three are given, the defaults ``gm_openstack``,
    ``root`` and an empty password are used.

    Reads the tables ``t_change``, ``t_revision`` and ``t_file`` and writes
    ``./<database>.csv`` with one row per file of every revision of every
    change. Progress is reported on stdout.
    """
    # set argument
    if len(sys.argv) == 4:
        current_db, user, passwd = sys.argv[1:4]
    else:
        current_db, user, passwd = "gm_openstack", "root", ""

    # Connect DB; the connection is released even when a query fails.
    connection = MySQLdb.connect(db=current_db, user=user, passwd=passwd)
    try:
        cursor = connection.cursor()

        # Get changes
        sys.stdout.write("\rCollecting changes...")
        cursor.execute("SELECT id, ch_Id, ch_changeId FROM t_change")
        changes = cursor.fetchall()

        # Get revisions
        sys.stdout.write("\rCollecting revisions...")
        cursor.execute("SELECT id, rev_Id, rev_changeId, rev_patchSetNum "
                       "FROM t_revision")
        revisions = cursor.fetchall()

        # Get files
        sys.stdout.write("\rCollecting files...")
        cursor.execute("SELECT f_fileName, f_revisionId FROM t_file")
        files = cursor.fetchall()
    finally:
        # Close DB connection
        connection.close()

    # Group revisions by rev_changeId (matched against t_change.id below).
    revisions_by_change = defaultdict(list)
    for revision in revisions:
        revisions_by_change[revision[2]].append(revision)

    # Group files by f_revisionId (matched against t_revision.id below).
    files_by_revision = defaultdict(list)
    for rev_file in files:
        files_by_revision[int(rev_file[1])].append(rev_file)

    # File list for output
    output_files = []

    # Search from changes
    changes_len = len(changes)
    for i, change in enumerate(changes, start=1):
        # .get() keeps the lookup from inserting empty lists into the index.
        ch_revisions = revisions_by_change.get(change[0], [])
        ch_id, ch_change_id = change[1], change[2]
        revisions_len = len(ch_revisions)

        # Search from revisions
        for j, revision in enumerate(ch_revisions, start=1):
            rev_id, rev_change_id, rev_patch_set_num = revision[1:4]
            output_files.extend(
                [ch_id, ch_change_id,
                 rev_id, rev_change_id,
                 quote_plus(rev_file[0]), rev_patch_set_num]
                for rev_file in files_by_revision.get(revision[0], []))
            # Counters are 1-based so the last line reads "N / N".
            sys.stdout.write("\rChange: %d / %d, Revision: %d / %d" %
                             (i, changes_len, j, revisions_len))

    # Output; newline='' lets the csv writer control line endings itself.
    with open(current_db + ".csv", 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, lineterminator='\n')
        sys.stdout.write("\rOutputting files...")
        writer.writerow(["ch_id", "ch_change_id",
                         "rev_id", "rev_change_id",
                         "f_file_name", "rev_patchSetNum"])
        writer.writerows(output_files)
# Run the export only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
#!/usr/bin/env python3
"""
Get file revised from csv

Usage:
    $ python3 src/RequestFileDiff.py current_db requests_header start end \
[--from-ini] [--from-prev]

Input:
    ./<current_db>.csv with the columns "ch_id", "rev_id", "f_file_name"
    and "rev_patchSetNum".

Request (one per CSV row):
    <requests_header>/changes/<ch_id>/revisions/<rev_patchSetNum>/files/<f_file_name>/diff

Output:
    ./revision_files/<current_db>/<rev_id>/<f_file_name>.json
"""
from csv import DictReader
from sys import argv, stdout
from os import mkdir, path, error
from time import sleep

from requests import get, exceptions
# Help text printed when the command line is malformed or help is requested.
USAGE = ("Usage: python3 src/RequestFileDiff.py current_db requests_header "
         "start end [--from-ini] [--from-prev]")

# Diff base modes, selected by the command-line flags.
FROM_BASE = 0  # default: no "base" request parameter
FROM_INI = 1   # --from-ini: diff against patch set 1
FROM_PREV = 2  # --from-prev: diff against the previous patch set
def main():
    """
    Download the diff of each listed file revision and store the raw body.

    Command line: ``current_db requests_header start end`` plus at most one
    of ``--from-ini`` / ``--from-prev``. ``start`` and ``end`` are 1-based,
    inclusive row numbers of ``./<current_db>.csv`` (header not counted).

    For every selected row the URL
    ``<requests_header>/changes/<ch_id>/revisions/<rev_patchSetNum>/files/<f_file_name>/diff``
    is requested (up to four attempts, 30 s apart) and the response text is
    written to ``./revision_files/<current_db>/<rev_id>/<f_file_name>.json``.
    Prints the usage text and returns when the arguments are invalid.
    """
    # Function-scope import so this block does not depend on the file header.
    from os import makedirs

    request_timeout = 60  # seconds; without it a stalled server hangs the run

    base_mode = FROM_BASE
    if "--from-ini" in argv:
        base_mode = FROM_INI
        argv.remove("--from-ini")
    elif "--from-prev" in argv:
        base_mode = FROM_PREV
        argv.remove("--from-prev")

    if len(argv) != 5 or "-h" in argv or "--help" in argv:
        print(USAGE)
        return

    # Set argument
    current_db = argv[1]
    requests_header = argv[2]  # exp) https://review.openstack.org
    try:
        start = int(argv[3])
        end = int(argv[4])
    except ValueError:
        print(USAGE)
        return

    # Make project's directory (and ./revision_files if it is missing)
    projects_path = "./revision_files/" + current_db
    makedirs(projects_path, exist_ok=True)

    with open(current_db + ".csv", 'r', newline='') as csvfile:
        reader = DictReader(csvfile, lineterminator='\n')
        for i, row in enumerate(reader, start=1):
            # Skip rows before the range without consuming row `start` itself.
            if i < start:
                continue
            if i > end:
                break

            f_file_name = str(row["f_file_name"])
            rev_patch_set_num = str(row["rev_patchSetNum"])
            requests_url = "/".join([requests_header,
                                     "changes", str(row["ch_id"]),
                                     "revisions", rev_patch_set_num,
                                     "files", f_file_name,
                                     "diff"])
            params = make_param_from(int(rev_patch_set_num), base_mode)

            # Reset per row so a failed row can never reuse an older response.
            response = None
            for _ in range(1, 5):
                try:
                    response = get(requests_url, params=params,
                                   timeout=request_timeout)
                except exceptions.RequestException as err:
                    print("\n" + str(i) + ": " + str(err))
                    sleep(30)
                    continue
                if response.status_code == 200:
                    break
                print("\n" + str(i) + ": " + requests_url + " " +
                      str(response.status_code))
                if response.status_code == 404:
                    # Retrying a missing resource is pointless.
                    break
                sleep(30)

            if response is None:
                # Every attempt raised: there is nothing to store for this row.
                print("\n" + str(i) + ": " + requests_url + " no response")
                continue

            # NOTE(review): a non-200 body is still written below, as before;
            # confirm whether error bodies should be skipped instead.
            response.encoding = 'utf-8'

            # Output
            revisions_path = "/".join([projects_path, row["rev_id"]])
            output_path = "/".join([revisions_path, f_file_name + ".json"])
            try:
                makedirs(revisions_path, exist_ok=True)
                with open(output_path, 'w', encoding='utf-8') as out_file:
                    out_file.write(response.text)
            except OSError:
                print("\nOS Error")
                continue

            stdout.write("\rFile: %d / %d" % (i, end))
def make_param_from(rev_patch_set_num, base_mode):
    """
    Return requests parameter

    rev_patch_set_num: patch set number of the revision being requested.
    base_mode: one of FROM_BASE, FROM_INI or FROM_PREV.

    Returns ``None`` (no query parameters) for patch set 1, for FROM_BASE
    and for any unrecognised mode; otherwise a dict whose ``"base"`` entry
    is ``"1"`` (FROM_INI) or the previous patch set number (FROM_PREV).
    """
    # Patch set 1 never gets a base, whatever the mode.
    if rev_patch_set_num == 1 or base_mode == FROM_BASE:
        return None
    if base_mode == FROM_INI:
        return {"base": "1"}
    if base_mode == FROM_PREV:
        return {"base": str(rev_patch_set_num - 1)}
    # Unknown mode: behave like FROM_BASE.
    return None
# Start the download only when executed as a script, not when imported.
if __name__ == '__main__':
    main()