"""
Populate ACCESS User Management System with Datasets
Also generate commands for linking Gen3 Google Groups based on mapping file provided
* You need gen3 package so `pip install gen3`
* Download TSV of current production ACCESS list after logging in as Super Admin
* `generate_ACCESS_info` in `generate.py` to output a new `ACCESS_info.txt`
* A list of new/updated studies in the format like:
* phs_consent, authid, full_name
* Log into the commons, download an API key and set the filepath API_KEY_FILEPATH
* Run `create_datasets_in_ACCESS.py`
* this creates the necessary datasets
"""
import csv
import logging
import os
import sys

import requests
from gen3.auth import Gen3Auth
from gen3.submission import Gen3Submission

# Gen3 Commons URL
COMMONS = "https://internalstaging.theanvil.io/"

# Download as CSV from the AnVIL Data Tracking Sheet that has the Bucket
# Mapping Manifest of buckets -> phsids+consents
# https://docs.google.com/spreadsheets/d/1TMYqi50bRyoFdBS8WdvEueIBej6ZYMhEAMeFjiXFHOQ/edit#gid=1612104098
# (expanduser so the "~" shorthand works when the file is opened)
BUCKET_MANIFEST_PATH = os.path.expanduser(
    "~/Downloads/AnVIL Data Release Tracker - Internal - Bucket Mapping Manifest (1).csv"
)
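# NOTE: the reader below expects the CSV to have "phsid", "consent", and
# "bucket" columns (see generate_gen3_google_linking_commands)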

# An API key downloaded from the above commons' "Profile" page
API_KEY_FILEPATH = os.path.expanduser("~/Documents/anvil_credentials.json")

ACCESS_INFO_OUTPUT_FILE = "./ACCESS_info.txt"
ACCESS_TSV_EXPORT = os.path.expanduser("~/Downloads/users_2020-10-07.tsv")
ACCESS_API_ENDPOINT = COMMONS.rstrip("/") + "/access-backend"

# log to both a file and stdout
logging.basicConfig(filename="create_datasets.log", level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))


def generate_gen3_google_linking_commands():
    output_file = "anvil_google_link_commands.sh"
    with open(output_file, "w+") as file:
        with open(BUCKET_MANIFEST_PATH) as csvfile:
            reader = csv.DictReader(csvfile, delimiter=",")
            for row in reader:
                print(row)
                phsid = row.get("phsid", "").strip()
                consent = row.get("consent", "").replace("-", "").strip()
                bucket = row.get("bucket", "").strip()

                if len(consent) == 1:
                    consent = "c" + consent

                # skip if we don't have enough info
                if not phsid or not consent or not bucket or len(consent) > 2:
                    continue

                phs_consent = phsid + "." + consent
                template = f"fence-create link-bucket-to-project --bucket_id {bucket} --bucket_provider google --project_auth_id {phs_consent}\n"
                file.write(template)
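
# For illustration: given a hypothetical row with phsid=phs000123, consent=1,
# and bucket=fc-secure-example, the function above would write:
#   fence-create link-bucket-to-project --bucket_id fc-secure-example --bucket_provider google --project_auth_id phs000123.c1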


def generate_ACCESS_info():
    auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
    submission = Gen3Submission(COMMONS, auth_provider=auth)

    print("getting currently submitted projects and phsids in preprod...")
    query_txt = """
    {
        project(first:0) {
            dbgap_accession_number
            project_id
        }
    }
    """
    results = submission.query(query_txt).get("data", {}).get("project", [])

    print("parsing existing prod ACCESS datasets from downloaded TSV export...")
    existing_phs_consents = {}
    with open(ACCESS_TSV_EXPORT, "r") as access_file:
        reader = csv.DictReader(access_file, delimiter="\t")
        headers = list(reader.fieldnames)
        for column_name in headers:
            if "phs" in column_name:
                # columns look like: "<project_id> (<phs_consent>)"
                project_id, phs_consent = column_name.strip().split(" ")
                phs_consent = phs_consent.strip("(").strip(")")
                existing_phs_consents[phs_consent] = project_id

    with open(ACCESS_INFO_OUTPUT_FILE, "w") as output:
        print("determining which projects exist in preprod but not in ACCESS...")
        for project in results:
            split_phs = project["dbgap_accession_number"].split(".")

            # don't handle non-dbGaP-standard phsids
            if len(split_phs) != 4:
                print(f"ignoring {split_phs}")
                continue

            phs_consent = ".".join((split_phs[0], split_phs[3]))
            project_id = project["project_id"]

            if phs_consent not in existing_phs_consents:
                output.write(f"{phs_consent} {phs_consent} {project_id}\n")
            elif project_id != existing_phs_consents[phs_consent]:
                print(
                    f"existing project_id {existing_phs_consents[phs_consent]} does not match new project_id {project_id} for {phs_consent}"
                )

    print(f"wrote new projects to {ACCESS_INFO_OUTPUT_FILE}")


def add_ACCESS_datasets():
    auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)

    datasets = []
    with open(ACCESS_INFO_OUTPUT_FILE, "r") as file:
        for line in file:
            phs_consent, authid, full_name = line.strip("\n").split(" ")
            try:
                program, project = full_name.split("-", 1)
            except ValueError:
                # nothing to split, assume it's a program
                program = full_name
                project = "N/A"

            payload = {
                "name": f"{full_name}",
                "phsid": f"{phs_consent}",
                "authid": f"{authid}",
                "program": f"{program}",
                "project": f"{project}",
            }
            datasets.append(payload)

    for dataset in datasets:
        # logging.info(f"deleting then creating {dataset}...")
        logging.info(f"creating {dataset}...")
        headers = {"Authorization": auth._get_auth_value()}
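        # NOTE (assumption): _get_auth_value() is a private gen3 helper that
        # returns the "bearer <token>" header value; Gen3Auth also implements
        # requests' auth interface, so `requests.post(..., auth=auth)` would
        # be a public alternative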
        # response = requests.delete(
        #     ACCESS_API_ENDPOINT + f"/datasets/{dataset['phsid']}", headers=headers
        # )
        response = requests.post(
            ACCESS_API_ENDPOINT + "/datasets", json=dataset, headers=headers
        )
        logging.info(f"response: {response.text}")
if __name__ == "__main__":
generate_ACCESS_info()
add_ACCESS_datasets()
generate_gen3_google_linking_commands()