Last active
March 19, 2020 20:21
-
-
Save millerdev/79d34a26a2f325125a32c65dd5315c50 to your computer and use it in GitHub Desktop.
Categorize CommCare HQ domains needing couch -> sql migration of forms and cases
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
""" | |
DEPRECATED moved to | |
https://github.com/dimagi/commcare-hq/blob/master/corehq/apps/couch_sql_migration/management/commands/couch_domain_report.py | |
Categorize CommCare HQ domains needing couch -> sql migration of forms and cases | |
This script requires a domains.csv file, which can be obtained by following | |
these steps: | |
1. Download Excel file containing domains using CouchDB for forms and cases | |
from https://www.commcarehq.org/hq/admin/domains/?es_use_sql_backend=F | |
2. Open the file and remove the (last three) summary rows. | |
3. Export to CSV format. | |
The output is a set of text files listing domains in each category. | |
Output filenames are formatted as "{ORIGIN}_{category}.txt" | |
This script lives at | |
https://gist.github.com/millerdev/79d34a26a2f325125a32c65dd5315c50 | |
Usage: | |
bucket_domains.py [options] DOMAINS_CSV ORIGIN | |
-h --help Show this help. | |
--output-dir=PATH Output directory [default: .] | |
--plot Show a pie graph of domain categories. | |
Requires extra setup: | |
python3 -m venv /path/to/virtualenv | |
# activate newly craeted virtualenv | |
pip install docopt pandas matplotlib | |
""" | |
import sys | |
from os.path import join | |
try: | |
import pandas as pd | |
from docopt import docopt | |
except ImportError: | |
print("Pre-setup required:\npip install docopt pandas") | |
sys.exit(1) | |
NOW = pd.datetime.now() | |
def main(): | |
args = docopt(__doc__) | |
data = pd.read_csv(args["DOMAINS_CSV"]) | |
origin = args["ORIGIN"] | |
output_dir = args["--output-dir"] | |
categories = {} | |
all_domains = len(data) | |
total_domains = 0 | |
print("{:<15} {:>6} {:>12} {}".format( | |
"category", | |
"domains", | |
"submissions", | |
"migration time", | |
)) | |
print("--") | |
for name, query in [ | |
("weird", get_weird), # mistakes? | |
("small", get_small(2000)), | |
("smallish", get_small(3500)), | |
("smallesque", get_small(5000)), | |
("fossilized", get_old(years=5)), | |
("inactive", get_old(years=1, limit=10000)), | |
("inactive_large", get_old(years=1, limit=100000)), | |
("super_large", get_large(720000)), | |
("large", get_large(60000)), | |
("moderate", lambda d: d), | |
]: | |
categories[name] = query(data) | |
data = left_xor(data, categories[name]) | |
if name == "weird": | |
forms = time_to_complete = "unknown" | |
else: | |
forms = categories[name]['# Form Submissions'].apply(pd.to_numeric).sum() | |
time_to_complete = get_time_to_complete(forms) | |
total_domains += len(categories[name]) | |
print("{:<15} {:>6} {:>12} {}".format( | |
name, | |
len(categories[name]), | |
forms, | |
time_to_complete, | |
)) | |
path = join(output_dir, "%s_%s.txt" % (origin, name)) | |
with open(path, "w", encoding="utf-8") as fh: | |
fh.write("\n".join(categories[name]['Project'])) | |
assert total_domains == all_domains, (total_domains, all_domains) | |
print("--") | |
print("total {:>16}".format(total_domains)) | |
if args["--plot"]: | |
show_pie_graph(categories) | |
def get_weird(data): | |
return data[data["# Active Mobile Workers"] == "Not yet calculated"] | |
def get_small(upper_limit): | |
def query(data): | |
return data[data['# Form Submissions'].apply(pd.to_numeric) < upper_limit] | |
return query | |
def get_large(lower_limit): | |
def query(data): | |
return data[data['# Form Submissions'].apply(pd.to_numeric) > lower_limit] | |
return query | |
def get_old(*, years, limit=None): | |
years_ago = NOW.replace(year=NOW.year - years) | |
def query(data): | |
where = data['Last Form Submission'].apply(pd.to_datetime) <= years_ago | |
if limit is not None: | |
where &= data['# Form Submissions'].apply(pd.to_numeric) < limit | |
return data[where] | |
return query | |
def left_xor(left, right): | |
merged = left.merge(right, indicator=True, how='outer') | |
merged = merged[merged['_merge'] == 'left_only'] | |
return merged.drop('_merge', 1) | |
def get_time_to_complete(forms): | |
# Estimated migration throughput is about 24 forms/sec in the first | |
# phase. Other phases add extra time. Estimate low here. And always | |
# add more buffer time if/when sharing these numbers. | |
num = forms / 10 # 10 forms/sec overall | |
for divisor, name, next_div in [ | |
(60, "minutes", 60), | |
(60, "hours", 24), | |
(24, "days", 7), | |
(7, "weeks", 4), | |
]: | |
num = num / divisor | |
if num < next_div * 2: | |
break | |
return "%.1f %s" % (num, name) | |
def show_pie_graph(categories): | |
import matplotlib.pyplot as plt | |
sizes = [] | |
labels = [] | |
for name, data in categories.items(): | |
sizes.append(len(data)) | |
labels.append(name + " (%d)" % len(data)) | |
fig1, ax1 = plt.subplots() | |
ax1.pie(sizes, labels=labels, startangle=90) | |
ax1.axis('equal') | |
plt.show() | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment