"""
Populate ACCESS User Management System with Datasets
Also generate commands for linking Gen3 Google Groups based on mapping file provided
* You need gen3 package so `pip install gen3`
* Download TSV of current production ACCESS list after logging in as Super Admin
* `generate_ACCESS_info` in `generate.py` to output a new `ACCESS_info.txt`
* A list of new/updated studies in the format like:
* phs_consent, authid, full_name
* Log into the commons, download an API key and set the filepath API_KEY_FILEPATH
* Run `create_datasets_in_ACCESS.py`
* this creates the necessary datasets
"""
import csv
import logging
import os
import sys

import requests
from gen3.auth import Gen3Auth
from gen3.submission import Gen3Submission

# Gen3 Commons URL
COMMONS = "https://internalstaging.theanvil.io/"

# Download as CSV from the AnVIL Data Tracking Sheet that has the Bucket
# Mapping Manifest of buckets -> phsids+consents
# https://docs.google.com/spreadsheets/d/1TMYqi50bRyoFdBS8WdvEueIBej6ZYMhEAMeFjiXFHOQ/edit#gid=1612104098
# (expanduser so the "~" shorthand works when the file is opened)
BUCKET_MANIFEST_PATH = os.path.expanduser(
    "~/Downloads/AnVIL Data Release Tracker - Internal - Bucket Mapping Manifest (1).csv"
)
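# NOTE: the reader below expects the CSV to have "phsid", "consent", and
# "bucket" columns (see generate_gen3_google_linking_commands)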

# An API key downloaded from the above commons' "Profile" page
API_KEY_FILEPATH = os.path.expanduser("~/Documents/anvil_credentials.json")

ACCESS_INFO_OUTPUT_FILE = "./ACCESS_info.txt"
ACCESS_TSV_EXPORT = os.path.expanduser("~/Downloads/users_2020-10-07.tsv")
ACCESS_API_ENDPOINT = COMMONS.rstrip("/") + "/access-backend"

# log to both a file and stdout
logging.basicConfig(filename="create_datasets.log", level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))


def generate_gen3_google_linking_commands():
    output_file = "anvil_google_link_commands.sh"
    with open(output_file, "w+") as file:
        with open(BUCKET_MANIFEST_PATH) as csvfile:
            reader = csv.DictReader(csvfile, delimiter=",")
            for row in reader:
                print(row)
                phsid = row.get("phsid", "").strip()
                consent = row.get("consent", "").replace("-", "").strip()
                bucket = row.get("bucket", "").strip()

                if len(consent) == 1:
                    consent = "c" + consent

                # skip if we don't have enough info
                if not phsid or not consent or not bucket or len(consent) > 2:
                    continue

                phs_consent = phsid + "." + consent
                template = f"fence-create link-bucket-to-project --bucket_id {bucket} --bucket_provider google --project_auth_id {phs_consent}\n"
                file.write(template)
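
# For illustration: given a hypothetical row with phsid=phs000123, consent=1,
# and bucket=fc-secure-example, the function above would write:
#   fence-create link-bucket-to-project --bucket_id fc-secure-example --bucket_provider google --project_auth_id phs000123.c1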


def generate_ACCESS_info():
    auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
    submission = Gen3Submission(COMMONS, auth_provider=auth)

    print("getting currently submitted projects and phsids in preprod...")
    query_txt = """
    {
        project(first:0) {
            dbgap_accession_number
            project_id
        }
    }
    """
    results = submission.query(query_txt).get("data", {}).get("project", [])

    print("parsing existing prod ACCESS datasets from downloaded TSV export...")
    existing_phs_consents = {}
    with open(ACCESS_TSV_EXPORT, "r") as access_file:
        reader = csv.DictReader(access_file, delimiter="\t")
        headers = list(reader.fieldnames)
        for column_name in headers:
            if "phs" in column_name:
                # columns look like: "<project_id> (<phs_consent>)"
                project_id, phs_consent = column_name.strip().split(" ")
                phs_consent = phs_consent.strip("(").strip(")")
                existing_phs_consents[phs_consent] = project_id

    with open(ACCESS_INFO_OUTPUT_FILE, "w") as output:
        print("determining which projects exist in preprod but not in ACCESS...")
        for project in results:
            split_phs = project["dbgap_accession_number"].split(".")

            # don't handle non-dbGaP-standard phsids
            if len(split_phs) != 4:
                print(f"ignoring {split_phs}")
                continue

            phs_consent = ".".join((split_phs[0], split_phs[3]))
            project_id = project["project_id"]

            if phs_consent not in existing_phs_consents:
                output.write(f"{phs_consent} {phs_consent} {project_id}\n")
            elif project_id != existing_phs_consents[phs_consent]:
                print(
                    f"existing project_id {existing_phs_consents[phs_consent]} does not match new project_id {project_id} for {phs_consent}"
                )

    print(f"wrote new projects to {ACCESS_INFO_OUTPUT_FILE}")


def add_ACCESS_datasets():
    auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)

    datasets = []
    with open(ACCESS_INFO_OUTPUT_FILE, "r") as file:
        for line in file:
            phs_consent, authid, full_name = line.strip("\n").split(" ")
            try:
                program, project = full_name.split("-", 1)
            except ValueError:
                # nothing to split, assume it's a program
                program = full_name
                project = "N/A"

            payload = {
                "name": f"{full_name}",
                "phsid": f"{phs_consent}",
                "authid": f"{authid}",
                "program": f"{program}",
                "project": f"{project}",
            }
            datasets.append(payload)

    for dataset in datasets:
        # logging.info(f"deleting then creating {dataset}...")
        logging.info(f"creating {dataset}...")
        headers = {"Authorization": auth._get_auth_value()}
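        # NOTE (assumption): _get_auth_value() is a private gen3 helper that
        # returns the "bearer <token>" header value; Gen3Auth also implements
        # requests' auth interface, so `requests.post(..., auth=auth)` would
        # be a public alternative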
        # response = requests.delete(
        #     ACCESS_API_ENDPOINT + f"/datasets/{dataset['phsid']}", headers=headers
        # )
        response = requests.post(
            ACCESS_API_ENDPOINT + "/datasets", json=dataset, headers=headers
        )
        logging.info(f"response: {response.text}")
if __name__ == "__main__":
generate_ACCESS_info()
add_ACCESS_datasets()
generate_gen3_google_linking_commands()