""" | |
Populate ACCESS User Management System with Datasets | |
Also generate commands for linking Gen3 Google Groups based on mapping file provided | |
* You need gen3 package so `pip install gen3` | |
* Download TSV of current production ACCESS list after logging in as Super Admin | |
* `generate_ACCESS_info` in `generate.py` to output a new `ACCESS_info.txt` | |
* A list of new/updated studies in the format like: | |
* phs_consent, authid, full_name | |
* Log into the commons, download an API key and set the filepath API_KEY_FILEPATH | |
* Run `create_datasets_in_ACCESS.py` | |
* this creates the necessary datasets | |
""" | |
import os
import requests
import logging
import sys
import csv

from gen3.submission import Gen3Submission
from gen3.auth import Gen3Auth
# Gen3 Commons URL
COMMONS = "https://internalstaging.theanvil.io/"

# download as CSV from the AnVIL Data Tracking Sheet that has the Bucket Mapping
# Manifest of buckets -> phsids+consents
# https://docs.google.com/spreadsheets/d/1TMYqi50bRyoFdBS8WdvEueIBej6ZYMhEAMeFjiXFHOQ/edit#gid=1612104098
# NOTE: open() does not expand "~", so expand user paths explicitly
BUCKET_MANIFEST_PATH = os.path.expanduser(
    "~/Downloads/AnVIL Data Release Tracker - Internal - Bucket Mapping Manifest (1).csv"
)

# An API key downloaded from the above commons' "Profile" page
API_KEY_FILEPATH = os.path.expanduser("~/Documents/anvil_credentials.json")

ACCESS_INFO_OUTPUT_FILE = "./ACCESS_info.txt"

# TSV export of the current production ACCESS list
ACCESS_TSV_EXPORT = os.path.expanduser("~/Downloads/users_2020-10-07.tsv")

ACCESS_API_ENDPOINT = COMMONS.rstrip("/") + "/access-backend"

logging.basicConfig(filename="create_datasets.log", level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
def generate_gen3_google_linking_commands():
    """
    Write a shell script of `fence-create` commands that link Google buckets
    to project auth IDs, based on the bucket mapping manifest.
    """
    output_file = "anvil_google_link_commands.sh"
    with open(output_file, "w") as file:
        with open(BUCKET_MANIFEST_PATH) as csvfile:
            reader = csv.DictReader(csvfile, delimiter=",")
            for row in reader:
                logging.debug(row)
                phsid = row.get("phsid", "").strip()
                consent = row.get("consent", "").replace("-", "").strip()
                bucket = row.get("bucket", "").strip()

                # normalize bare consent numbers like "1" to "c1"
                if len(consent) == 1:
                    consent = "c" + consent

                # skip if we don't have enough info
                if not phsid or not consent or not bucket or len(consent) > 2:
                    continue

                phs_consent = phsid + "." + consent
                template = f"fence-create link-bucket-to-project --bucket_id {bucket} --bucket_provider google --project_auth_id {phs_consent}\n"
                file.write(template)
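# For illustration: a bucket manifest row like the hypothetical one below
#
#   phsid,consent,bucket
#   phs000123,1,example-anvil-bucket
#
# would have its consent normalized to "c1" and emit this line to the script:
#
#   fence-create link-bucket-to-project --bucket_id example-anvil-bucket --bucket_provider google --project_auth_id phs000123.c1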
def generate_ACCESS_info():
    """
    Determine which projects exist in the commons but not in ACCESS, and
    write them to ACCESS_INFO_OUTPUT_FILE.
    """
    auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
    submission = Gen3Submission(COMMONS, auth_provider=auth)

    print("getting currently submitted projects and phsids in preprod...")
    query_txt = """
    {
      project(first:0) {
        dbgap_accession_number
        project_id
      }
    }
    """
    results = submission.query(query_txt).get("data", {}).get("project", [])

    print("parsing existing prod ACCESS datasets from downloaded TSV export...")
    existing_phs_consents = {}
    with open(ACCESS_TSV_EXPORT, "r") as access_file:
        reader = csv.DictReader(access_file, delimiter="\t")
        headers = list(reader.fieldnames)
        for column_name in headers:
            # dataset columns are expected to look like "<project_id> (<phs_consent>)"
            if "phs" in column_name:
                project_id, phs_consent = column_name.strip().split(" ")
                phs_consent = phs_consent.strip("(").strip(")")
                existing_phs_consents[phs_consent] = project_id

    with open(ACCESS_INFO_OUTPUT_FILE, "w") as output:
        print("determining which projects exist in preprod but not in ACCESS...")
        for project in results:
            split_phs = project["dbgap_accession_number"].split(".")

            # don't handle non dbGaP-standard phsids
            if len(split_phs) != 4:
                print(f"ignoring {split_phs}")
                continue

            phs_consent = ".".join((split_phs[0], split_phs[3]))
            project_id = project["project_id"]
            if phs_consent not in existing_phs_consents:
                output.write(f"{phs_consent} {phs_consent} {project_id}\n")
            elif project_id != existing_phs_consents[phs_consent]:
                print(
                    f"existing project_id {existing_phs_consents[phs_consent]} does not match new project_id {project_id} for {phs_consent}"
                )

    print(f"wrote new projects to {ACCESS_INFO_OUTPUT_FILE}")
def add_ACCESS_datasets():
    """
    Read ACCESS_INFO_OUTPUT_FILE and create each dataset in ACCESS via its API.
    """
    auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)

    datasets = []
    with open(ACCESS_INFO_OUTPUT_FILE, "r") as file:
        for line in file:
            phs_consent, authid, full_name = line.strip("\n").split(" ")
            try:
                program, project = full_name.split("-", 1)
            except ValueError:
                # nothing to split, assume the full name is the program
                program = full_name
                project = "N/A"
            payload = {
                "name": full_name,
                "phsid": phs_consent,
                "authid": authid,
                "program": program,
                "project": project,
            }
            datasets.append(payload)

    for dataset in datasets:
        # logging.info(f"deleting then creating {dataset}...")
        logging.info(f"creating {dataset}...")
        headers = {"Authorization": auth._get_auth_value()}
        # response = requests.delete(
        #     ACCESS_API_ENDPOINT + f"/datasets/{dataset['phsid']}", headers=headers
        # )
        response = requests.post(
            ACCESS_API_ENDPOINT + "/datasets", json=dataset, headers=headers
        )
        logging.info(f"response: {response.text}")
if __name__ == "__main__":
    generate_ACCESS_info()
    add_ACCESS_datasets()
    generate_gen3_google_linking_commands()
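# Note: anvil_google_link_commands.sh is not executed by this script; the
# generated fence-create commands are presumably meant to be run wherever
# the fence CLI is available.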