d0choa · September 30, 2024 09:42
diff --git a/2021_approvals.csv b/2021_approvals.csv
diff --git a/2021_approvals.R b/2021_approvals.R
 library("tidyverse")
 library("sparklyr")
 library("sparklyr.nested")
 library("cowplot")
 library("ggsci")

 # Spark configuration, starting from the sparklyr defaults
 config <- spark_config()

 # Requester-pays settings so the GCS datasets below can be read
 config[["spark.hadoop.fs.gs.requester.pays.mode"]] <- "AUTO"
 config[["spark.hadoop.fs.gs.requester.pays.project.id"]] <- "open-targets-eu-dev" # nolint

 # Open the Spark session on YARN
 sc <- spark_connect(config = config, master = "yarn")

 # Approvals as reported in the NRDD article (not cached in Spark memory)
 gs_approvals <- "gs://ot-team/dochoa/2021_approvals.csv"
 approvals <- spark_read_csv(sc, path = gs_approvals, memory = FALSE)

 # Datasource metadata, collected into a local data frame
 ds_names <- collect(
    spark_read_csv(
        sc,
        path = "gs://ot-team/dochoa/datasourceMetadata.csv",
        memory = FALSE
    )
 )

 # Read Platform data
 # Bucket and release that every Platform dataset path is derived from
 gs_path <- "gs://open-targets-data-releases/"
 data_release <- "21.11"
 # Shared prefix of the ETL parquet outputs for the selected release
 etl_parquet_path <- paste0(gs_path, data_release, "/output/etl/parquet/")
 # One path per dataset; each keeps its trailing slash
 all_evidence_path <- paste0(etl_parquet_path, "evidence/")
 moa_path <- paste0(etl_parquet_path, "mechanismOfAction/")
 ass_indirectby_ds_path <- paste0(
    etl_parquet_path,
    "associationByDatasourceIndirect/"
 )
 disease_path <- paste0(etl_parquet_path, "diseases/")
 interaction_path <- paste0(etl_parquet_path, "interaction/")
 disease2phenotype_path <- paste0(etl_parquet_path, "diseaseToPhenotype/")

 # Mechanisms of action
 # Manually curated drug -> target pairs (ChEMBL id = Ensembl gene id) added
 # to fill gaps in the Platform MoA data. A ChEMBL id may be listed more than
 # once (CHEMBL4297774) to give one drug several targets.
 ammend_moas <- list(
    "CHEMBL4594302" = "ENSG00000134318",
    "CHEMBL4297741" = "ENSG00000215644",
    "CHEMBL4297774" = "ENSG00000146648",
    "CHEMBL4297774" = "ENSG00000105976",
    "CHEMBL4298185" = "ENSG00000112964", # chembl missing in platform
    "CHEMBL4650319" = "ENSG00000146648",
    "CHEMBL1863514" = "ENSG00000166183",
    "CHEMBL4594320" = "ENSG00000171298"
 )
 # One row per curated pair: list names become chemblIds, values targetId
 new_moas <- data.frame(
    chemblIds = names(ammend_moas),
    targetId = unlist(ammend_moas)
 )
 # Copy the curated pairs to Spark so they can be combined with Platform data
 new_moas <- sdf_copy_to(sc, new_moas, overwrite = TRUE)

 # Platform MoAs plus the amended pairs: chemblIds and targets are exploded
 # (presumably array columns -- confirm against the mechanismOfAction schema)
 # and de-duplicated; the curated rows are appended after de-duplication, so
 # a curated pair already present in the Platform data would appear twice.
 moa <- spark_read_parquet(sc, moa_path, memory = FALSE) %>%
    select(chemblIds, targets) %>%
    sdf_explode(chemblIds) %>%
    sdf_explode(targets) %>%
    rename(targetId = targets) %>%
    sdf_distinct() %>%
    sdf_bind_rows(new_moas)

 # Platform indirect associations (by datasource)
 ass_indirectby_ds <- spark_read_parquet(sc, ass_indirectby_ds_path)

 # Approvals annotated with their targets and the matching target-disease
 # associations; left joins keep approvals without a target or association
 ass <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    left_join(moa, by = c("DrugId" = "chemblIds")) %>%
    left_join(ass_indirectby_ds, by = c("diseaseId", "targetId")) %>%
    collect()

 # Molecular interactions: IntAct pairs with both partners present and a
 # score above 0.42
 interactions <- spark_read_parquet(sc, interaction_path, memory = FALSE) %>%
    filter(
        sourceDatabase == "intact",
        !is.na(targetA),
        !is.na(targetB),
        scoring > 0.42
    ) %>%
    select(targetA, targetB) %>%
    sdf_distinct()

 # Datasource/drug pairs where a protein interacting with the drug target
 # is associated with the approved indication
 interactors_ass <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
    inner_join(interactions, by = c("targetId" = "targetA")) %>%
    inner_join(
        ass_indirectby_ds,
        by = c("diseaseId" = "diseaseId", "targetB" = "targetId")
    ) %>%
    select(datasourceId, Drug_brand_name) %>%
    sdf_distinct() %>%
    collect() %>%
    mutate(interactionAssociation = TRUE)

 # Additional phenotype curation (disease id = related phenotype id).
 # A disease may be listed more than once to attach several phenotypes.
 ammend_phenotypes <- list(
    # Microalbuminuria (biomarker of CKD)
    "EFO_0000401" = "HP_0012594",
    # glycodeoxycholate sulfate (one of the bile acids that cause pruritus)
    "Orphanet_172" = "EFO_0005653",
    "Orphanet_52" = "EFO_0005653",
    # achondroplasia -> body height
    "Orphanet_15" = "EFO_0004339",
    "Orphanet_15" = "Orphanet_329191",
    # von Hippel-Lindau -> renal carcinoma
    "Orphanet_892" = "EFO_0000681",
    "EFO_0001360" = "MONDO_0018582",
    # growth delay -> height
    "HP_0001510" = "EFO_0004339",
    # CAD -> myocardial infarction
    "EFO_0001645" = "EFO_0000612"
 )
 # Curated pairs as a two-column table, copied to Spark
 new_phenotypes <- data.frame(
    diseaseId = names(ammend_phenotypes),
    phenotype = unlist(ammend_phenotypes)
 )
 new_phenotypes <- sdf_copy_to(sc, new_phenotypes, overwrite = TRUE)

 # Platform disease to phenotype data, reduced to unique pairs
 disease2phenotype <- spark_read_parquet(
    sc,
    disease2phenotype_path,
    memory = FALSE
 ) %>%
    select(diseaseId = disease, phenotype) %>%
    sdf_distinct()

 # Datasource/drug pairs where the drug target is associated with a
 # phenotype (Platform or curated) of the approved indication
 phenotype_ass <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
    inner_join(
        sdf_bind_rows(disease2phenotype, new_phenotypes),
        by = "diseaseId"
    ) %>%
    inner_join(
        ass_indirectby_ds,
        by = c("phenotype" = "diseaseId", "targetId")
    ) %>%
    select(datasourceId, Drug_brand_name) %>%
    sdf_distinct() %>%
    collect() %>%
    mutate(phenotypeAssociation = TRUE)

 # Data to plot: one row per datasource/drug holding the highest direct
 # score plus flags for interaction- and phenotype-based support
 data2plot <- ass %>%
    select(datasourceId, Drug_brand_name, score) %>%
    # every datasource/drug combination; combinations without data score 0
    complete(datasourceId, Drug_brand_name) %>%
    mutate(score = replace_na(score, 0)) %>%
    filter(!is.na(datasourceId)) %>%
    # therapeutic area of each drug
    left_join(distinct(ass, Drug_brand_name, TA), by = "Drug_brand_name") %>%
    # drugs without any mapped target
    left_join(
        ass %>%
            mutate(noTarget = is.na(targetId)) %>%
            distinct(Drug_brand_name, noTarget),
        by = "Drug_brand_name"
    ) %>%
    # support through interacting proteins and related phenotypes
    left_join(interactors_ass, by = c("datasourceId", "Drug_brand_name")) %>%
    left_join(phenotype_ass, by = c("datasourceId", "Drug_brand_name")) %>%
    mutate(
        # a direct association also counts as interaction/phenotype support
        interactionAssociation = ifelse(score > 0, TRUE, interactionAssociation),
        phenotypeAssociation = ifelse(score > 0, TRUE, phenotypeAssociation),
        # drugs without a target get no score and a therapeutic area of their own
        score = ifelse(noTarget, NA, score),
        TA = ifelse(noTarget, "No human target", TA),
        TA = fct_other(
            TA,
            keep = c("Oncology", "No human target"),
            other_level = "Other indication"
        ),
        TA = fct_relevel(
            TA,
            c("Oncology", "Other indication", "No human target")
        )
    ) %>%
    # datasources left out of the figure
    filter(
        !(datasourceId %in% c(
            "chembl", "expression_atlas", "sysbio", "europepmc",
            "phenodigm", "reactome", "phewas_catalog"
        ))
    ) %>%
    # drug score used only to order the drugs on the axis
    mutate(
        rankscore = replace_na(score, 0),
        rankscore = ifelse(
            !is.na(interactionAssociation), rankscore + 0.01, rankscore
        ),
        rankscore = ifelse(
            !is.na(phenotypeAssociation), rankscore + 0.03, rankscore
        ),
        Drug_brand_name = fct_rev(fct_reorder(
            Drug_brand_name, rankscore, mean,
            na.rm = TRUE, .desc = TRUE
        ))
    ) %>%
    group_by(
        datasourceId,
        Drug_brand_name,
        TA,
        noTarget,
        interactionAssociation,
        phenotypeAssociation
    ) %>%
    # max() of an all-NA group is -Inf (warning silenced); mapped to NA below
    summarise(score = suppressWarnings(max(score, na.rm = TRUE))) %>%
    mutate(score = ifelse(score < 0, NA, score)) %>%
    # display name and type of each datasource, in metadata order
    left_join(ds_names, by = "datasourceId") %>%
    mutate(
        datasourceName = factor(
            datasourceName,
            levels = ds_names$datasourceName
        ),
        datasourceType = factor(
            datasourceType,
            levels = c(
                "Somatic",
                "Functional genomics (cancer)",
                "Rare mendelian",
                "Common disease"
            )
        )
    )


 # Symbols to overlay in the plot: one row per tile and kind of support
 # ("interaction" or "phenotype") that is present
 overlay_data <- data2plot %>%
    ungroup() %>%
    select(
        datasourceName,
        datasourceType,
        Drug_brand_name,
        TA,
        interactionAssociation,
        phenotypeAssociation
    ) %>%
    gather("overlay", "value", interactionAssociation, phenotypeAssociation) %>%
    filter(!is.na(value)) %>%
    mutate(
        overlay = str_replace_all(overlay, "Association", ""),
        # phenotype support is drawn larger than interaction support
        overlaySize = ifelse(overlay == "phenotype", 3, 1),
        overlaySymbol = as.character(ifelse(overlay == "phenotype", 1, 16))
    )

 # Plotting: heatmap of direct association scores (drug x datasource) with
 # overlaid symbols for phenotype- and interaction-based support, faceted by
 # therapeutic area and datasource type
 output <- data2plot %>%
    ggplot(aes(
        x = datasourceName,
        y = Drug_brand_name)) +
    geom_tile(aes(fill = score), color = "white") +
    geom_point(data = overlay_data,
        aes(shape = overlay, size = overlaySize)) +
    scale_fill_material("blue",
        na.value = "grey90",
        name = "Direct association"
    ) +
    scale_shape_manual(
        breaks = c("phenotype", "interaction"),
        labels = c("Direct or related phenotype", "Direct or interacting protein"),
        # named so each level keeps its shape however ggplot2 orders
        # unnamed manual values
        values = c("phenotype" = 1, "interaction" = 16),
        name = "Supported by:") +
    # overlaySize already holds the point sizes to draw
    scale_size_identity() +
    facet_grid(TA ~ datasourceType, scales = "free", space = "free") +
    theme_cowplot(font_size = 12) +
    # labs(
    #     title = "Supporting evidence on 2021 FDA drug approvals",
    #     subtitle = "Target-Disease evidence from Open Targets"
    #     # caption =
    #     #     "Source: Nat Reviews Drug Discovery 10.1038/d41573-022-00001-9"
    # ) +
    theme(
        plot.background = element_rect(fill = "white"),
        strip.background = element_blank(),
        legend.direction = "horizontal",
        legend.box = "vertical",
        legend.position = c(-0.7, -0.16),
        legend.justification = c(0, 0),
        axis.ticks = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title = element_blank(),
        axis.line = element_blank(),
        text = element_text(family = "sans")
    ) +
    guides(
        fill = guide_colourbar(
            title.position = "top",
            title.hjust = 0.5,
            barwidth = 8,
            frame.colour = "black",
            ticks.colour = "black",
            order = 2
            ),
        shape = guide_legend(
            title.position = "top",
            direction = "vertical",
            order = 1
        )
    )
 # Write the figure to disk
 ggsave(
    "/home/ochoa/2021_approvals.pdf",
    plot = output,
    width = 9,
    height = 11
 )
diff --git a/2021_approvals_brief.r b/2021_approvals_brief.r
 library("tidyverse")
 library("sparklyr")
 library("sparklyr.nested")
 library("cowplot")
 library("ggsci")

 # Spark configuration, starting from the sparklyr defaults
 config <- spark_config()

 # Requester-pays settings so the GCS datasets below can be read
 config[["spark.hadoop.fs.gs.requester.pays.mode"]] <- "AUTO"
 config[["spark.hadoop.fs.gs.requester.pays.project.id"]] <- "open-targets-eu-dev" # nolint

 # Open a local Spark session
 sc <- spark_connect(config = config, master = "local")

 # Approvals as reported in the NRDD article (not cached in Spark memory)
 gs_approvals <- "gs://ot-team/dochoa/2021_approvals.csv"
 approvals <- spark_read_csv(sc, path = gs_approvals, memory = FALSE)

 # Datasource metadata, collected into a local data frame
 ds_names <- collect(
    spark_read_csv(
        sc,
        path = "gs://ot-team/dochoa/datasourceMetadata.csv",
        memory = FALSE
    )
 )

 # Read Platform data
 # Bucket and release that every Platform dataset path is derived from
 gs_path <- "gs://open-targets-data-releases/"
 data_release <- "21.11"
 # Shared prefix of the ETL parquet outputs for the selected release
 etl_parquet_path <- paste0(gs_path, data_release, "/output/etl/parquet/")
 # One path per dataset; each keeps its trailing slash
 all_evidence_path <- paste0(etl_parquet_path, "evidence/")
 moa_path <- paste0(etl_parquet_path, "mechanismOfAction/")
 ass_indirectby_ds_path <- paste0(
    etl_parquet_path,
    "associationByDatasourceIndirect/"
 )
 disease_path <- paste0(etl_parquet_path, "diseases/")
 interaction_path <- paste0(etl_parquet_path, "interaction/")
 disease2phenotype_path <- paste0(etl_parquet_path, "diseaseToPhenotype/")

 # Mechanisms of action
 # Manually curated drug -> target pairs (ChEMBL id = Ensembl gene id) added
 # to fill gaps in the Platform MoA data. A ChEMBL id may be listed more than
 # once (CHEMBL4297774) to give one drug several targets.
 ammend_moas <- list(
    "CHEMBL4594302" = "ENSG00000134318",
    "CHEMBL4297741" = "ENSG00000215644",
    "CHEMBL4297774" = "ENSG00000146648",
    "CHEMBL4297774" = "ENSG00000105976",
    "CHEMBL4298185" = "ENSG00000112964", # chembl missing in platform
    "CHEMBL4650319" = "ENSG00000146648",
    "CHEMBL1863514" = "ENSG00000166183",
    "CHEMBL4594320" = "ENSG00000171298"
 )
 # One row per curated pair: list names become chemblIds, values targetId
 new_moas <- data.frame(
    chemblIds = names(ammend_moas),
    targetId = unlist(ammend_moas)
 )
 # Copy the curated pairs to Spark so they can be combined with Platform data
 new_moas <- sdf_copy_to(sc, new_moas, overwrite = TRUE)

 # Platform MoAs plus the amended pairs: chemblIds and targets are exploded
 # (presumably array columns -- confirm against the mechanismOfAction schema)
 # and de-duplicated; the curated rows are appended after de-duplication, so
 # a curated pair already present in the Platform data would appear twice.
 moa <- spark_read_parquet(sc, moa_path, memory = FALSE) %>%
    select(chemblIds, targets) %>%
    sdf_explode(chemblIds) %>%
    sdf_explode(targets) %>%
    rename(targetId = targets) %>%
    sdf_distinct() %>%
    sdf_bind_rows(new_moas)

 # Platform indirect associations (by datasource)
 ass_indirectby_ds <- spark_read_parquet(sc, ass_indirectby_ds_path)

 # Approvals annotated with their targets and the matching target-disease
 # associations; left joins keep approvals without a target or association
 ass <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    left_join(moa, by = c("DrugId" = "chemblIds")) %>%
    left_join(ass_indirectby_ds, by = c("diseaseId", "targetId")) %>%
    collect()

 # Molecular interactions: IntAct pairs with both partners present and a
 # score above 0.42
 interactions <- spark_read_parquet(sc, interaction_path, memory = FALSE) %>%
    filter(
        sourceDatabase == "intact",
        !is.na(targetA),
        !is.na(targetB),
        scoring > 0.42
    ) %>%
    select(targetA, targetB) %>%
    sdf_distinct()

 # Datasource/drug pairs where a protein interacting with the drug target
 # is associated with the approved indication
 interactors_ass <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
    inner_join(interactions, by = c("targetId" = "targetA")) %>%
    inner_join(
        ass_indirectby_ds,
        by = c("diseaseId" = "diseaseId", "targetB" = "targetId")
    ) %>%
    select(datasourceId, Drug_brand_name) %>%
    sdf_distinct() %>%
    collect() %>%
    mutate(interactionAssociation = TRUE)

 # Additional phenotype curation (disease id = related phenotype id).
 # A disease may be listed more than once to attach several phenotypes.
 ammend_phenotypes <- list(
    # Microalbuminuria (biomarker of CKD)
    "EFO_0000401" = "HP_0012594",
    # glycodeoxycholate sulfate (one of the bile acids that cause pruritus)
    "Orphanet_172" = "EFO_0005653",
    "Orphanet_52" = "EFO_0005653",
    # achondroplasia -> body height
    "Orphanet_15" = "EFO_0004339",
    "Orphanet_15" = "Orphanet_329191",
    # von Hippel-Lindau -> renal carcinoma
    "Orphanet_892" = "EFO_0000681",
    "EFO_0001360" = "MONDO_0018582",
    # growth delay -> height
    "HP_0001510" = "EFO_0004339",
    # CAD -> myocardial infarction
    "EFO_0001645" = "EFO_0000612"
 )
 # Curated pairs as a two-column table, copied to Spark
 new_phenotypes <- data.frame(
    diseaseId = names(ammend_phenotypes),
    phenotype = unlist(ammend_phenotypes)
 )
 new_phenotypes <- sdf_copy_to(sc, new_phenotypes, overwrite = TRUE)

 # Platform disease to phenotype data, reduced to unique pairs
 disease2phenotype <- spark_read_parquet(
    sc,
    disease2phenotype_path,
    memory = FALSE
 ) %>%
    select(diseaseId = disease, phenotype) %>%
    sdf_distinct()

 # Datasource/drug pairs where the drug target is associated with a
 # phenotype (Platform or curated) of the approved indication
 phenotype_ass <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
    inner_join(
        sdf_bind_rows(disease2phenotype, new_phenotypes),
        by = "diseaseId"
    ) %>%
    inner_join(
        ass_indirectby_ds,
        by = c("phenotype" = "diseaseId", "targetId")
    ) %>%
    select(datasourceId, Drug_brand_name) %>%
    sdf_distinct() %>%
    collect() %>%
    mutate(phenotypeAssociation = TRUE)

 # Data to plot: one row per datasource/drug holding the highest direct
 # score plus flags for interaction- and phenotype-based support
 data2plot <- ass %>%
    select(datasourceId, Drug_brand_name, score) %>%
    # every datasource/drug combination; combinations without data score 0
    complete(datasourceId, Drug_brand_name) %>%
    mutate(score = replace_na(score, 0)) %>%
    filter(!is.na(datasourceId)) %>%
    # therapeutic area of each drug
    left_join(distinct(ass, Drug_brand_name, TA), by = "Drug_brand_name") %>%
    # drugs without any mapped target
    left_join(
        ass %>%
            mutate(noTarget = is.na(targetId)) %>%
            distinct(Drug_brand_name, noTarget),
        by = "Drug_brand_name"
    ) %>%
    # support through interacting proteins and related phenotypes
    left_join(interactors_ass, by = c("datasourceId", "Drug_brand_name")) %>%
    left_join(phenotype_ass, by = c("datasourceId", "Drug_brand_name")) %>%
    mutate(
        # a direct association also counts as interaction/phenotype support
        interactionAssociation = ifelse(score > 0, TRUE, interactionAssociation),
        phenotypeAssociation = ifelse(score > 0, TRUE, phenotypeAssociation),
        # drugs without a target get no score and a therapeutic area of their own
        score = ifelse(noTarget, NA, score),
        TA = ifelse(noTarget, "No human target", TA),
        TA = fct_other(
            TA,
            keep = c("Oncology", "No human target"),
            other_level = "Other indication"
        ),
        TA = fct_relevel(
            TA,
            c("Oncology", "Other indication", "No human target")
        )
    ) %>%
    # datasources left out of the figure
    filter(
        !(datasourceId %in% c(
            "chembl", "expression_atlas", "sysbio", "europepmc",
            "phenodigm", "reactome", "phewas_catalog"
        ))
    ) %>%
    # drug score used only to order the drugs
    mutate(
        rankscore = replace_na(score, 0),
        rankscore = ifelse(
            !is.na(interactionAssociation), rankscore + 0.01, rankscore
        ),
        rankscore = ifelse(
            !is.na(phenotypeAssociation), rankscore + 0.03, rankscore
        ),
        Drug_brand_name = fct_rev(fct_reorder(
            Drug_brand_name, rankscore, mean,
            na.rm = TRUE, .desc = TRUE
        ))
    ) %>%
    group_by(
        datasourceId,
        Drug_brand_name,
        TA,
        noTarget,
        interactionAssociation,
        phenotypeAssociation
    ) %>%
    # max() of an all-NA group is -Inf (warning silenced); mapped to NA below
    summarise(score = suppressWarnings(max(score, na.rm = TRUE))) %>%
    mutate(score = ifelse(score < 0, NA, score)) %>%
    # display name and type of each datasource, in metadata order
    left_join(ds_names, by = "datasourceId") %>%
    mutate(
        datasourceName = factor(
            datasourceName,
            levels = ds_names$datasourceName
        ),
        datasourceType = factor(
            datasourceType,
            levels = c(
                "Somatic",
                "Functional genomics (cancer)",
                "Rare mendelian",
                "Common disease"
            )
        )
    )

 # Collapse grouped plot data to one set of evidence flags per group.
 # `grouped_data` must be grouped and carry noTarget, interactionAssociation,
 # phenotypeAssociation and a score without missing values.
 # The flags are made mutually exclusive with the priority
 # direct score > related phenotype > interacting protein; noEvidence is TRUE
 # when none of them (nor noTarget) applies.
 summarise_brief_evidence <- function(grouped_data) {
    grouped_data %>%
        summarise(
            noTarget = any(noTarget),
            interactionAssociation = any(interactionAssociation),
            phenotypeAssociation = any(phenotypeAssociation),
            # plain comparison: ifelse() is not needed on a length-one value
            score = max(score, na.rm = TRUE) > 0
        ) %>%
        mutate(
            noTarget = replace_na(noTarget, FALSE),
            phenotypeAssociation = replace_na(phenotypeAssociation, FALSE),
            phenotypeAssociation = ifelse(score, FALSE, phenotypeAssociation),
            interactionAssociation = replace_na(interactionAssociation, FALSE),
            interactionAssociation = ifelse(score, FALSE, interactionAssociation),
            interactionAssociation = ifelse(
                phenotypeAssociation, FALSE, interactionAssociation
            ),
            noEvidence = !(
                interactionAssociation | phenotypeAssociation | score | noTarget
            )
        )
 }

 # Values per data source type: long format, keeping only the flags that hold
 briefplotdata <- data2plot %>%
    mutate(score = replace_na(score, 0)) %>%
    group_by(Drug_brand_name, TA, datasourceType) %>%
    summarise_brief_evidence() %>%
    gather("evidence", "value", -Drug_brand_name, -TA, -datasourceType) %>%
    filter(value)

 # Values across all data sources, labelled as datasourceType "Any"
 briefplotdataAny <- data2plot %>%
    mutate(score = replace_na(score, 0)) %>%
    group_by(Drug_brand_name, TA) %>%
    summarise_brief_evidence() %>%
    mutate(datasourceType = "Any") %>%
    gather("evidence", "value", -Drug_brand_name, -TA, -datasourceType) %>%
    filter(value)


 # Brief figure: one tile per drug and data source type, coloured by the
 # strongest kind of genetic support; drugs are laid out left to right
 # within each therapeutic area
 output <- bind_rows(briefplotdataAny, briefplotdata) %>%
    # level names go in positionally: fct_relevel() takes them through `...`
    # and has no `levels` argument
    mutate(datasourceType = fct_relevel(
        datasourceType,
        "Any",
        "Somatic",
        "Functional genomics (cancer)",
        "Rare mendelian",
        "Common disease"
    )) %>%
    mutate(evidence = fct_relevel(evidence,
        "score",
        "phenotypeAssociation",
        "interactionAssociation",
        "noTarget",
        "noEvidence")) %>%
    mutate(evidence = fct_recode(evidence,
        "Direct" = "score",
        "Close phenotype" = "phenotypeAssociation",
        "Interacting protein" = "interactionAssociation",
        "No human target" = "noTarget",
        "Not available" = "noEvidence"
    )) %>%
    arrange(TA, desc(evidence)) %>%
    # position of each drug along the x axis, per data source type
    group_by(datasourceType) %>%
    mutate(rn = row_number()) %>%
    # "Not available" becomes NA so it is drawn with the scale's na.value
    mutate(evidence = replace(evidence, evidence == "Not available", NA)) %>%
    ggplot(aes(x = rn, y = fct_rev(datasourceType), fill = fct_rev(evidence))) +
    geom_tile(color = "white", height = .8, size = 0.5) +
    facet_grid(
        ~TA,
        scales = "free",
        space = "free"
    ) +
    # scale_fill_npg(name = "Genetic support", na.value = "grey90") +
    scale_fill_manual(
        name = "Genetic support",
        # named so each category keeps its colour however ggplot2 orders
        # unnamed manual values
        values = c(
            "Direct" = "#3C5488FF",
            "Close phenotype" = "#00A087FF",
            "Interacting protein" = "#4DBBD5FF",
            "No human target" = "grey60"
        ),
        breaks = c("Direct", "Close phenotype", "Interacting protein", "No human target"),
        na.value = "grey90") +
    scale_y_discrete(name = "Genetic data source", labels = function(x) str_wrap(x, width = 12)) +
    theme_cowplot(font_size = 11) +
    theme(
        plot.background = element_rect(fill = "white"),
        strip.background = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "bottom",
        axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0)),
        axis.line = element_blank(),
        text = element_text(family = "sans"),
        panel.spacing = unit(-0.5, "lines")
    )

 # Write the figure to disk (no trailing empty argument after dpi)
 ggsave(
    "/home/ochoa/2021_approvals_brief.pdf",
    plot = output,
    width = 6.5,
    height = 3.5,
    dpi = 400
 )
diff --git a/datasourceMetadata.csv b/datasourceMetadata.csv
diff --git a/export_data.r b/export_data.r

 
 # Datasources with a direct association per drug, ";"-separated.
 # Some datasources are left out and "eva" is reported as "clinvar".
 directSources <- ass %>%
    filter(
        !is.na(datasourceId),
        !(datasourceId %in% c(
            "chembl", "expression_atlas", "sysbio", "europepmc",
            "phenodigm", "reactome", "phewas_catalog"
        ))
    ) %>%
    mutate(datasourceId = str_replace(datasourceId, "eva", "clinvar")) %>%
    group_by(Drug_brand_name) %>%
    summarise(directSources = paste(unique(datasourceId), collapse = ";"))

 # Overall ("Any") evidence category per drug.
 # NOTE(review): 2021_approvals_brief.r assigns a ggplot object to `output`;
 # this step needs `output` to be the data frame behind that figure -- confirm
 # what `output` holds when this script is run.
 summaryResults <- select(
    filter(output, datasourceType == "Any"),
    Drug_brand_name, evidence
 )

 # Related phenotypes supporting each drug: ids, names and datasources,
 # ";"-separated.
 # The join is rebuilt in Spark rather than started from `phenotype_ass`:
 # as built in the plotting scripts that table is already collected and only
 # keeps datasourceId and Drug_brand_name, so it has no `phenotype` column
 # and cannot be joined to a Spark table.
 closePhenotypes <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
    inner_join(
        sdf_bind_rows(disease2phenotype, new_phenotypes),
        by = "diseaseId"
    ) %>%
    inner_join(
        ass_indirectby_ds,
        by = c("phenotype" = "diseaseId", "targetId")
    ) %>%
    select(Drug_brand_name, datasourceId, phenotype) %>%
    # phenotype names from the Platform disease index
    left_join(
        spark_read_parquet(sc, disease_path) %>%
            select(phenotype = id, phenotypeName = name),
        by = "phenotype"
    ) %>%
    collect() %>%
    mutate(datasourceId = datasourceId %>% str_replace("eva", "clinvar")) %>%
    filter(!(datasourceId %in% c("chembl", "expression_atlas", "sysbio", "europepmc", "phenodigm", "reactome", "phewas_catalog"))) %>%
    distinct() %>%
    group_by(Drug_brand_name) %>%
    summarise(
        closePhenotypeIds = paste(unique(phenotype), collapse = ";"),
        closePhenotypeNames = paste(unique(phenotypeName), collapse = ";"),
        closePhenotypeDataSources = paste(unique(datasourceId), collapse = ";")
    )

 # Platform target index for the selected release
 target_path <- paste0(gs_path, data_release, "/output/etl/parquet/target/")

 # Interacting proteins supporting each drug: ids, symbols and datasources,
 # ";"-separated
 intDf <- approvals %>%
    rename(diseaseId = DiseaseId) %>%
    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
    inner_join(interactions, by = c("targetId" = "targetA")) %>%
    inner_join(
        ass_indirectby_ds,
        by = c("diseaseId" = "diseaseId", "targetB" = "targetId")
    ) %>%
    # approved symbol of the interacting protein
    left_join(
        select(
            spark_read_parquet(sc, target_path),
            targetB = id, approvedSymbol
        ),
        by = "targetB"
    ) %>%
    select(Drug_brand_name, targetB, datasourceId, approvedSymbol) %>%
    collect() %>%
    mutate(datasourceId = str_replace(datasourceId, "eva", "clinvar")) %>%
    filter(
        !(datasourceId %in% c(
            "chembl", "expression_atlas", "sysbio", "europepmc",
            "phenodigm", "reactome", "phewas_catalog"
        ))
    ) %>%
    distinct() %>%
    group_by(Drug_brand_name) %>%
    summarise(
        interactingIds = paste(unique(targetB), collapse = ";"),
        interactingSymbols = paste(unique(approvedSymbol), collapse = ";"),
        interactingDataSources = paste(unique(datasourceId), collapse = ";")
    )

 # Final export: one row per approval with its targets and the evidence
 # summaries computed above, joined by drug
 out <- ass %>%
    group_by(Drug_brand_name, Sponsor, DrugId, Indication, diseaseId, Properties) %>%
    # unique(): `ass` is joined to the per-datasource associations, so a
    # target can occur on several rows of the same approval
    summarise(targetIds = paste(unique(targetId), collapse = ";")) %>%
    ungroup() %>%
    left_join(summaryResults, by = "Drug_brand_name") %>%
    left_join(directSources, by = "Drug_brand_name") %>%
    left_join(closePhenotypes, by = "Drug_brand_name") %>%
    left_join(intDf, by = "Drug_brand_name")

 # Write the table to disk
 out %>% write_csv("/home/ochoa/2021_approvals_output.csv")
Drug (brand name)	Sponsor	Properties	Indication	DrugId	DiseaseId	TA	Manual disease mapping	ChemblCheck
Vericiguat (Verquvo)	Merck & Co./Bayer	sGC stimulator	Chronic heart failure	CHEMBL4066936	EFO_0001645	Cardiovascular	fuzzy
Cabotegravir; rilpivirine (Cabenuva Kit)	ViiV	INSTI and an NNRTI	HIV-1 infection	CHEMBL2403238	EFO_0000180	Infectious	exact
Voclosporin (Lupkynis)	Aurinia	Calcineurin inhibitor	Lupus nephritis	CHEMBL2218919	EFO_0002690	Nephrology	exact
Tepotinib (Tepmetko)	EMD Serono	MET kinase inhibitor	NSCLC	CHEMBL3402762	EFO_0003060	Oncology	exact
Umbralisib (Ukoniq)	TG Therapeutics	PI3Kδ and CK1ε inhibitor	MZL, follicular lymphoma	CHEMBL3948730	EFO_1000630	Oncology	exact
Evinacumab (Evkeeza)	Regeneron	ANGPTL3-targeted mAb	HoFH	CHEMBL3545191	Orphanet_391665	Metabolic	exact
Trilaciclib (Cosela)	G1 Therapeutics	CDK4 and CDK6 kinase inhibitor	Chemotherapy-induced myelosuppression	CHEMBL3894860	EFO_0000702	Oncology	NA
Casimersen (Amondys 45)	Sarepta	Exon 45-skipping ASO	DMD	CHEMBL4297566	Orphanet_98896	Other	exact
Fosdenopterin (Nulibry)	BridgeBio	cPMP	MoCD type A	CHEMBL2338675	Orphanet_308386	Other	exact
Melphalan flufenamide (Pepaxto)	Oncopeptides	Peptide-conjugated alkylating drug	Multiple myeloma	CHEMBL4303060	EFO_0001378	Oncology	exact
Dexmethylphenidate; serdexmethylphenidate (Azstarys)	Commave Therapeutics	CNS stimulant	ADHD	CHEMBL827	EFO_0003888	Psychiatric	exact
Tivozanib (Fotivda)	Aveo	VEGFR kinase inhibitor	Renal cell carcinoma	CHEMBL1289494	EFO_0000681	Oncology	exact
Ponesimod (Ponvory)	J&J	S1P receptor modulator	Relapsing multiple sclerosis	CHEMBL1096146	EFO_0003885	Other	fuzzy
Dasiglucagon (Zegalogue)	Zealand Pharma	Glucagon receptor agonist	Severe hypoglycaemia	CHEMBL4297741	EFO_0001360	Metabolic	exact
Viloxazine (Qelbree)	Supernus	SNRI	ADHD	CHEMBL306700	EFO_0003888	Psychiatric	exact
Drospirenone; estetrol (Nextstellis)	Mayne Pharma	Spironolactone and oestrogen analogues	To prevent pregnancy	CHEMBL1509	NA	Reproductive	NA
Dostarlimab (Jemperli)	GlaxoSmithKline	PD1-targeted mAb	Endometrial cancer	CHEMBL4298124	MONDO_0011962	Oncology	exact
Loncastuximab tesirine (Zynlonta)	ADC Therapeutics	CD19-targeted ADC	B-cell lymphoma	CHEMBL4297778	EFO_0000403	Oncology	exact
Pegcetacoplan (Empaveli)	Apellis	Complement protein C3 inhibitor	PNH	CHEMBL4298211	Orphanet_447	Other	exact
Amivantamab (Rybrevant)	J&J	EGFR×METR bispecific antibody	EGFR exon 20-mutated NSCLC	CHEMBL4297774	EFO_0003060	Oncology	fuzzy
Piflufolastat F-18 (Pylarify)	Progenics	Radiolabelled PSMA imaging agent	Prostate cancer imaging	NA	NA	Imaging	NA
Infigratinib (Truseltiq)	BridgeBio	FGFR2 kinase inhibitor	FGFR2-mutated bile duct cancer	CHEMBL1852688	EFO_0005540	Oncology	fuzzy
Sotorasib (Lumakras)	Amgen	KRAS-G12C inhibitor	KRASG12C-mutated NSCLC	CHEMBL4535757	EFO_0003060	Oncology	fuzzy	TRUE
Olanzapine; samidorphan (Lybalvi)	Alkermes	Atypical antipsychotic and opioid antagonist	Schizophrenia and bipolar I disorder	CHEMBL715	EFO_0000692	Psychiatric	exact
Ibrexafungerp (Brexafemme)	Scynexis	Triterpenoid antifungal	Vulvovaginal candidiasis	CHEMBL4297513	EFO_0007543	Infectious	exact
Aducanumab (Aduhelm)	Biogen/Eisai	Amyloid-β-targeted mAb	Alzheimer’s disease	CHEMBL3039540	EFO_0000249	Other	exact
Asparaginase erwinia chrysanthemi (Rylaze)	Jazz	Recombinant asparagine-specific enzyme	ALL and LBL, in patients allergic to E. coli-derived products	CHEMBL1863514	EFO_0000220	Oncology	fuzzy
Finerenone (Kerendia)	Bayer	Non-steroidal MR antagonist	CKD with type 2 diabetes	CHEMBL2181927	EFO_0000401	Other	exact
Fexinidazole (Fexinidazole)	Sanofi/DNDi	Nitroimidazole antimicrobial	Sleeping sickness	CHEMBL1631694	DOID_10113	Infectious	exact
Belumosudil (Rezurock)	Kadmon	ROCK2 kinase inhibitor	Chronic GVHD	CHEMBL4594302	MONDO_0013730	Other	exact
Odevixibat (Bylvay)	Albireo	IBAT inhibitor	Pruritus in PFIC	CHEMBL4297588	Orphanet_172	Other	exact	TRUE
Anifrolumab (Saphnelo)	AstraZeneca	IFNAR-targeted mAb	SLE	CHEMBL2364653	EFO_0002690	Other	exact
Avalglucosidase alfa (Nexviazyme)	Sanofi	Recombinant α-glucosidase	Pompe disease	CHEMBL4594320	Orphanet_365	Other	fuzzy
Belzutifan (Welireg)	Merck & Co.	HIF-2α inhibitor	von Hippel-Lindau disease	CHEMBL4585668	Orphanet_892	Oncology	exact	TRUE
Difelikefalin (Korsuva)	Cara Therapeutics	κ-Opioid receptor agonist	Pruritus associated with CKD	CHEMBL3989915	EFO_0003884	Other	fuzzy
Lonapegsomatropin (Skytrofa)	Ascendis Pharma	PEGylated human growth hormone	Growth failure due to GHD	CHEMBL4298185	HP_0001510	Other	NA	TRUE
Mobocertinib (Exkivity)	Takeda	EGFR kinase inhibitor	EGFR exon 20-mutated NSCLC	CHEMBL4650319	EFO_0003060	Oncology	fuzzy
Tisotumab vedotin (Tivdak)	Seagen/Genmab	Tissue-factor-directed ADC	Cervical cancer	CHEMBL4297841	MONDO_0002974	Oncology	exact
Atogepant (Qulipta)	AbbVie	CGRP receptor antagonist	Episodic migraine	CHEMBL3991065	EFO_0003821	Other	fuzzy
Maralixibat (Livmarli)	Mirum	IBAT inhibitor	Pruritus in Alagille syndrome	CHEMBL363392	Orphanet_52	Other	fuzzy
Avacopan (Tavneos)	ChemoCentryx	Complement 5a receptor antagonist	ANCA-associated vasculitis	CHEMBL3989871	EFO_0004826	Cardiovascular	exact
Asciminib (Scemblix)	Novartis	ABL/BCR–ABL1 kinase inhibitor	Ph+ CML	CHEMBL4208229	EFO_0000339	Oncology	fuzzy
Ropeginterferon alfa-2b (Besremi)	Pharmaessentia	PEGylated interferon α-2b	Polycythaemia vera	CHEMBL4297819	EFO_0002429	Oncology	exact
Vosoritide (Voxzogo)	Biomarin	CNP analogue	Achondroplasia	CHEMBL3707276	Orphanet_15	Other	exact
Maribavir (Livtencity)	Takeda	CMV pUL97 kinase inhibitor	Post-transplant CMV infection	CHEMBL515408	EFO_0001062	Infectious	fuzzy
Pafolacianine (Cytalux)	On Target Labs	Fluorescent FR imaging agent	Ovarian cancer imaging	CHEMBL4297412	MONDO_0008170	Imaging	exact
Efgartigimod alfa (Vyvgart)	Argenx	FcRn-binding Fc fragment	Myasthenia gravis	CHEMBL4297551	EFO_0004991	Other	exact
Tezepelumab (Tezspire)	Astrazeneca/Amgen	TSLP-targeted mAb	Severe asthma	CHEMBL3707229	EFO_0000270	Respiratory	exact
Inclisiran (Leqvio)	Novartis/Alnylam	PCSK9-targeted siRNA	HeFH or ASCVD	CHEMBL3990033	MONDO_0021661	Cardiovascular	fuzzy
Tralokinumab (Adbry)	LEO Pharma	IL-13-targeted mAb	Atopic dermatitis	CHEMBL1743081	EFO_0000274	Dermatology	fuzzy
	library("tidyverse")
	library("sparklyr")
	library("sparklyr.nested")
	library("cowplot")
	library("ggsci")

	#Spark config
	config <- spark_config()

	# Allowing to GCP datasets access
	config$spark.hadoop.fs.gs.requester.pays.mode <- "AUTO" # nolint
	config$spark.hadoop.fs.gs.requester.pays.project.id <- "open-targets-eu-dev" # nolint

	# spark connect
	sc <- spark_connect(master = "yarn", config = config)

	# Approvals as reported in the NRDD article, registered as a Spark table
	# without caching it in memory
	gs_approvals <- "gs://ot-team/dochoa/2021_approvals.csv"
	approvals <- spark_read_csv(sc, path = gs_approvals, memory = FALSE)

	# Datasource metadata, collected into a local data frame
	ds_names <- collect(
	    spark_read_csv(
	        sc,
	        path = "gs://ot-team/dochoa/datasourceMetadata.csv",
	        memory = FALSE
	    )
	)

	# Locations of the Platform datasets for the selected data release
	gs_path <- "gs://open-targets-data-releases/"
	data_release <- "21.11"

	# Every dataset path is <bucket><release>/output/etl/parquet/<dataset>/
	all_evidence_path <- paste0(
	    gs_path, data_release, "/output/etl/parquet/evidence/"
	)
	moa_path <- paste0(
	    gs_path, data_release, "/output/etl/parquet/mechanismOfAction/"
	)
	ass_indirectby_ds_path <- paste0(
	    gs_path, data_release,
	    "/output/etl/parquet/associationByDatasourceIndirect/"
	)
	disease_path <- paste0(
	    gs_path, data_release, "/output/etl/parquet/diseases/"
	)
	interaction_path <- paste0(
	    gs_path, data_release, "/output/etl/parquet/interaction/"
	)
	disease2phenotype_path <- paste0(
	    gs_path, data_release, "/output/etl/parquet/diseaseToPhenotype/"
	)

	# Mechanisms of action
	# Manually curated drug -> target pairs that fill gaps in the available
	# MoAs; a drug id is repeated when it has more than one target
	ammend_moas <- as.list(c(
	    "CHEMBL4594302" = "ENSG00000134318",
	    "CHEMBL4297741" = "ENSG00000215644",
	    "CHEMBL4297774" = "ENSG00000146648",
	    "CHEMBL4297774" = "ENSG00000105976",
	    "CHEMBL4298185" = "ENSG00000112964", # chembl missing in platform
	    "CHEMBL4650319" = "ENSG00000146648",
	    "CHEMBL1863514" = "ENSG00000166183",
	    "CHEMBL4594320" = "ENSG00000171298"
	))

	# Two-column table with one curated pair per row
	new_moas <- data.frame(
	    chemblIds = names(ammend_moas),
	    targetId = unlist(ammend_moas, use.names = FALSE)
	)
	# Upload the curated pairs so they can be combined with the Spark tables
	new_moas <- sdf_copy_to(sc, new_moas, overwrite = TRUE)

	# Available MoAs flattened to distinct (chemblIds, targetId) pairs,
	# followed by the amended pairs
	platform_moa <- spark_read_parquet(sc, moa_path, memory = FALSE) %>%
	    select(chemblIds, targets) %>%
	    sdf_explode(chemblIds) %>%
	    sdf_explode(targets) %>%
	    rename(targetId = targets) %>%
	    sdf_distinct()
	moa <- sdf_bind_rows(platform_moa, new_moas)

	# Platform indirect associations (by datasource)
	ass_indirectby_ds <- spark_read_parquet(sc, ass_indirectby_ds_path)

	# Approvals joined to their targets and to the target-disease
	# associations; left joins keep the approvals that have no target or no
	# association. The result is collected into a local data frame.
	approvals_targets <- left_join(
	    rename(approvals, diseaseId = DiseaseId),
	    moa,
	    by = c("DrugId" = "chemblIds")
	)
	ass <- collect(
	    left_join(
	        approvals_targets,
	        ass_indirectby_ds,
	        by = c("diseaseId", "targetId")
	    )
	)

	# Molecular interactions: distinct (targetA, targetB) pairs from the
	# "intact" source database with both partners present and a score above 0.42
	interactions <- spark_read_parquet(sc, interaction_path, memory = FALSE) %>%
	    filter(
	        sourceDatabase == "intact",
	        !is.na(targetA),
	        !is.na(targetB),
	        scoring > 0.42
	    ) %>%
	    select(targetA, targetB) %>%
	    sdf_distinct()

	# Datasource/drug pairs supported through an interacting protein: the
	# association is looked up for the interaction partner (targetB) of the
	# drug target and the approved indication
	interactor_pairs <- approvals %>%
	    rename(diseaseId = DiseaseId) %>%
	    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
	    inner_join(interactions, by = c("targetId" = "targetA")) %>%
	    inner_join(
	        ass_indirectby_ds,
	        by = c("diseaseId", "targetB" = "targetId")
	    ) %>%
	    select(datasourceId, Drug_brand_name) %>%
	    sdf_distinct()

	# Collect locally and flag every surviving pair
	interactors_ass <- mutate(
	    collect(interactor_pairs),
	    interactionAssociation = TRUE
	)

	# Additional phenotype curation: manually added disease -> phenotype pairs
	# (a disease id is repeated when it maps to several phenotypes)
	ammend_phenotypes <- as.list(c(
	    # Microalbuminuria (biomarker of CKD)
	    "EFO_0000401" = "HP_0012594",
	    # glycodeoxycholate sulfate (one of the bile acids that cause pruritus)
	    "Orphanet_172" = "EFO_0005653",
	    "Orphanet_52" = "EFO_0005653",
	    # achondroplasia -> body height
	    "Orphanet_15" = "EFO_0004339",
	    "Orphanet_15" = "Orphanet_329191",
	    # von Hippel-Lindau -> renal carcinoma
	    "Orphanet_892" = "EFO_0000681",
	    "EFO_0001360" = "MONDO_0018582",
	    # growth delay -> height
	    "HP_0001510" = "EFO_0004339",
	    # CAD -> myocardial infarction
	    "EFO_0001645" = "EFO_0000612"
	))

	# Two-column table with one curated pair per row
	new_phenotypes <- data.frame(
	    diseaseId = names(ammend_phenotypes),
	    phenotype = unlist(ammend_phenotypes, use.names = FALSE)
	)
	# Upload the curated pairs so they can be combined with the Spark tables
	new_phenotypes <- sdf_copy_to(sc, new_phenotypes, overwrite = TRUE)

	# Platform disease to phenotype data as distinct (diseaseId, phenotype)
	# pairs
	disease2phenotype <- sdf_distinct(
	    select(
	        spark_read_parquet(sc, disease2phenotype_path, memory = FALSE),
	        diseaseId = disease,
	        phenotype
	    )
	)

	# Associations through indirect phenotypes: datasource/drug pairs where
	# the drug target is associated with a phenotype linked to the approved
	# indication (platform pairs plus the curated ones)
	related_phenotypes <- sdf_bind_rows(disease2phenotype, new_phenotypes)
	phenotype_pairs <- approvals %>%
	    rename(diseaseId = DiseaseId) %>%
	    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
	    inner_join(related_phenotypes, by = c("diseaseId")) %>%
	    inner_join(
	        ass_indirectby_ds,
	        by = c("phenotype" = "diseaseId", "targetId")
	    ) %>%
	    select(datasourceId, Drug_brand_name) %>%
	    sdf_distinct()

	# Collect locally and flag every surviving pair
	phenotype_ass <- mutate(
	    collect(phenotype_pairs),
	    phenotypeAssociation = TRUE
	)

	# Data to plot: one row per datasource x drug combination with its direct
	# association score, the drug's TA label and the interaction/phenotype
	# support flags, annotated with the datasource display name and type.
	data2plot <- ass %>%
	select(datasourceId, Drug_brand_name, score) %>%
	# expand to every datasource x drug combination; combinations that were
	# absent from `ass` get a score of 0
	complete(datasourceId, Drug_brand_name) %>%
	mutate(score = replace_na(score, 0)) %>%
	# drop the rows whose datasourceId is missing
	filter(!is.na(datasourceId)) %>%
	# TA label of each drug (presumably the therapeutic area - confirm
	# against the approvals table)
	left_join(
	ass %>%
	select(
	Drug_brand_name,
	TA
	) %>%
	distinct(),
	by = "Drug_brand_name"
	) %>%
	# targets: noTarget is TRUE when the drug has no targetId in `ass`
	left_join(
	ass %>%
	mutate(noTarget = is.na(targetId)) %>%
	select(
	Drug_brand_name,
	noTarget
	) %>%
	distinct(),
	by = "Drug_brand_name"
	) %>%
	# interactions: adds interactionAssociation (TRUE or NA)
	left_join(
	interactors_ass,
	by = c("datasourceId", "Drug_brand_name")
	) %>%
	# related phenotypes: adds phenotypeAssociation (TRUE or NA)
	left_join(
	phenotype_ass,
	by = c("datasourceId", "Drug_brand_name")
	) %>%
	# a positive direct score also counts as support in both flags; this is
	# evaluated before the score of target-less drugs is blanked below
	mutate(
	interactionAssociation = ifelse(score > 0, TRUE, interactionAssociation)
	) %>%
	mutate(
	phenotypeAssociation = ifelse(score > 0, TRUE, phenotypeAssociation)
	) %>%
	# drugs flagged noTarget: blank the score and relabel their TA
	mutate(score = ifelse(noTarget, NA, score)) %>%
	mutate(TA = ifelse(noTarget, "No human target", TA)) %>%
	# collapse every other TA value into "Other indication"
	mutate(
	TA = fct_other(
	TA,
	keep = c("Oncology", "No human target"),
	other_level = "Other indication"
	)
	) %>%
	# fix the order of the TA levels
	mutate(
	TA = fct_relevel(TA, c(
	"Oncology",
	"Other indication",
	"No human target"
	))
	) %>%
	# mutate(datasourceId = fct_relevel(datasourceId, names(ds_name_list))) %>%
	# datasources left out of the plot
	filter(!(datasourceId %in% c("chembl", "expression_atlas", "sysbio", "europepmc", "phenodigm", "reactome", "phewas_catalog"))) %>%
	# rankscore is only used to order the drugs: the score (NA counted as 0)
	# plus a small bonus for interaction (0.01) and phenotype (0.03) support
	mutate(rankscore = replace_na(score, 0)) %>%
	mutate(rankscore = ifelse(!is.na(interactionAssociation), rankscore + 0.01, rankscore)) %>%
	mutate(rankscore = ifelse(!is.na(phenotypeAssociation), rankscore + 0.03, rankscore)) %>%
	# order the drug levels by their mean rankscore (descending, then reversed)
	mutate(Drug_brand_name = fct_rev(fct_reorder(
	Drug_brand_name, rankscore, mean,
	na.rm = TRUE, .desc = TRUE
	))) %>%
	group_by(
	datasourceId,
	Drug_brand_name,
	TA,
	noTarget,
	interactionAssociation,
	phenotypeAssociation
	) %>%
	# keep the maximum score per group; a group with only NA scores yields
	# -Inf (warning suppressed), which the next step turns back into NA
	summarise(score = suppressWarnings(max(score, na.rm = TRUE))) %>%
	mutate(score = ifelse(score < 0, NA, score)) %>%
	# attach the display name and type of each datasource
	left_join(ds_names, by = "datasourceId") %>%
	# datasourceName levels follow the row order of ds_names; datasourceType
	# is limited to the four types listed, in this order
	mutate(
	datasourceName = factor(datasourceName, levels = ds_names$datasourceName),
	datasourceType = factor(datasourceType, levels = c("Somatic", "Functional genomics (cancer)", "Rare mendelian", "Common disease"))
	)


	# Symbols to overlay in the plot: one row per datasource/drug pair and
	# kind of support ("interaction" or "phenotype"), with the point size and
	# symbol code used for that kind
	overlay_data <- data2plot %>%
	    ungroup() %>%
	    select(
	        datasourceName,
	        datasourceType,
	        Drug_brand_name,
	        TA,
	        interactionAssociation,
	        phenotypeAssociation
	    ) %>%
	    # pivot_longer() replaces the superseded gather(); values_drop_na
	    # removes the pairs without support (flag is NA)
	    pivot_longer(
	        cols = c(interactionAssociation, phenotypeAssociation),
	        names_to = "overlay",
	        values_to = "value",
	        values_drop_na = TRUE
	    ) %>%
	    # keep the row order gather() produced: all interaction rows first,
	    # then all phenotype rows (arrange() keeps ties in their input order)
	    arrange(overlay) %>%
	    mutate(
	        overlay = str_replace_all(overlay, "Association", ""),
	        overlaySize = ifelse(overlay == "phenotype", 3, 1),
	        overlaySymbol = as.character(ifelse(overlay == "phenotype", 1, 16))
	    )

	# Plotting: tiles coloured by the direct association score, overlaid with
	# point symbols for support through phenotypes or interacting proteins,
	# faceted by TA (rows) and datasource type (columns)

	# Theme tweaks applied on top of theme_cowplot()
	plot_theme <- theme(
	    plot.background = element_rect(fill = "white"),
	    strip.background = element_blank(),
	    legend.direction = "horizontal",
	    legend.box = "vertical",
	    legend.position = c(-0.7, -0.16),
	    legend.justification = c(0, 0),
	    axis.ticks = element_blank(),
	    axis.text.x = element_text(angle = 45, hjust = 1),
	    axis.title = element_blank(),
	    axis.line = element_blank(),
	    text = element_text(family = "sans")
	)

	# Legend layout: symbol legend first, score colour bar second
	plot_guides <- guides(
	    fill = guide_colourbar(
	        title.position = "top",
	        title.hjust = 0.5,
	        barwidth = 8,
	        frame.colour = "black",
	        ticks.colour = "black",
	        order = 2
	    ),
	    shape = guide_legend(
	        title.position = "top",
	        direction = "vertical",
	        order = 1
	    )
	)

	output <- ggplot(
	    data2plot,
	    aes(x = datasourceName, y = Drug_brand_name)
	) +
	    geom_tile(aes(fill = score), color = "white") +
	    geom_point(
	        mapping = aes(shape = overlay, size = overlaySize),
	        data = overlay_data
	    ) +
	    scale_fill_material(
	        "blue",
	        na.value = "grey90",
	        name = "Direct association"
	    ) +
	    scale_shape_manual(
	        breaks = c("phenotype", "interaction"),
	        labels = c(
	            "Direct or related phenotype",
	            "Direct or interacting protein"
	        ),
	        values = c(1, 16),
	        name = "Supported by:"
	    ) +
	    scale_size_identity() +
	    facet_grid(TA ~ datasourceType, scales = "free", space = "free") +
	    theme_cowplot(font_size = 12) +
	    # Titles are currently disabled:
	    # labs(
	    #     title = "Supporting evidence on 2021 FDA drug approvals",
	    #     subtitle = "Target-Disease evidence from Open Targets"
	    #     # caption =
	    #     # "Source: Nat Reviews Drug Discovery 10.1038/d41573-022-00001-9"
	    # ) +
	    plot_theme +
	    plot_guides

	# Write the figure as a 9 x 11 PDF
	ggsave(
	    filename = "/home/ochoa/2021_approvals.pdf",
	    plot = output,
	    width = 9,
	    height = 11
	)
datasourceId	datasourceName	datasourceType
cancer_gene_census	CGC (COSMIC)	Somatic
intogen	IntOgen	Somatic
cancer_biomarkers	Cancer Biomarkers (CGI)	Somatic
crispr	Project Score	Functional genomics (cancer)
slapenrich	SlapEnrich	Functional genomics (cancer)
progeny	Progeny	Functional genomics (cancer)
eva_somatic	ClinVar (Somatic)	Somatic
ot_genetics_portal	OT Genetics Portal	Common disease
phewas_catalog	Phewas Catalog	Common disease
eva	ClinVar	Rare mendelian
clingen	Clingen	Rare mendelian
genomics_england	GEL PanelApp	Rare mendelian
orphanet	Orphanet	Rare mendelian
gene2phenotype	gene2phenotype	Rare mendelian
uniprot_literature	Uniprot (gene-disease)	Rare mendelian
uniprot_variants	Uniprot (variants)	Rare mendelian
reactome	Reactome	Functional genomics (cancer)
phenodigm	Mouse model (phenodigm)	Mouse model
europepmc	Literature (EPMC)	Literature
expression_atlas	ExpressionAtlas (Diff expression)	Differential Expression
chembl	drugs	Drugs


	# Datasources with an association for each drug in `ass`, as one
	# ";"-separated string per drug ("eva" is reported as "clinvar")
	directSources <- ass %>%
	    filter(
	        !(datasourceId %in% c(
	            "chembl", "expression_atlas", "sysbio", "europepmc",
	            "phenodigm", "reactome", "phewas_catalog"
	        )),
	        !is.na(datasourceId)
	    ) %>%
	    mutate(datasourceId = str_replace(datasourceId, "eva", "clinvar")) %>%
	    group_by(Drug_brand_name) %>%
	    summarise(directSources = paste(unique(datasourceId), collapse = ";"))

	# Per-drug summary: the rows of `output` whose datasourceType is "Any",
	# keeping the drug name and the `evidence` column.
	# NOTE(review): this expects `output` to be a table with `datasourceType`
	# and `evidence` columns. The only `output` assigned in the visible script
	# is the ggplot object, which dplyr's filter() has no method for, and no
	# "Any" datasource type or `evidence` column is created there - confirm
	# which table is meant before running this step.
	summaryResults <- output %>%
	filter(datasourceType == "Any") %>%
	select(Drug_brand_name, evidence)

	# Related phenotypes supporting each drug: ids, names and datasources of
	# the phenotypes linked to the approved indication for which the drug
	# target has an association, as ";"-separated strings per drug.
	#
	# The join is rebuilt in Spark instead of starting from `phenotype_ass`:
	# that table is already collected and reduced to datasourceId and
	# Drug_brand_name (plus its flag), so it has no `phenotype` column to
	# select and cannot be joined to the Spark disease table.
	closePhenotypes <- approvals %>%
	    rename(diseaseId = DiseaseId) %>%
	    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
	    # platform disease -> phenotype pairs plus the curated ones
	    inner_join(
	        disease2phenotype %>%
	            sdf_bind_rows(new_phenotypes),
	        by = "diseaseId"
	    ) %>%
	    inner_join(
	        ass_indirectby_ds,
	        by = c("phenotype" = "diseaseId", "targetId")
	    ) %>%
	    select(Drug_brand_name, datasourceId, phenotype) %>%
	    sdf_distinct() %>%
	    # phenotype names come from the platform disease table
	    left_join(
	        spark_read_parquet(sc, disease_path) %>%
	            select(phenotype = id, phenotypeName = name),
	        by = "phenotype"
	    ) %>%
	    collect() %>%
	    mutate(datasourceId = str_replace(datasourceId, "eva", "clinvar")) %>%
	    filter(
	        !(datasourceId %in% c(
	            "chembl", "expression_atlas", "sysbio", "europepmc",
	            "phenodigm", "reactome", "phewas_catalog"
	        ))
	    ) %>%
	    distinct() %>%
	    group_by(Drug_brand_name) %>%
	    summarise(
	        closePhenotypeIds = paste(unique(phenotype), collapse = ";"),
	        closePhenotypeNames = paste(unique(phenotypeName), collapse = ";"),
	        closePhenotypeDataSources = paste(unique(datasourceId), collapse = ";")
	    )

	# Location of the platform target table
	target_path <- paste0(
	    gs_path, data_release, "/output/etl/parquet/target/"
	)

	# Target ids and approved symbols, keyed as targetB for the join below
	target_symbols <- select(
	    spark_read_parquet(sc, target_path),
	    targetB = id,
	    approvedSymbol
	)

	# Interacting proteins supporting each drug: partners (targetB) of the
	# drug targets that have an association with the approved indication,
	# with their ids, symbols and datasources as ";"-separated strings per drug
	intDf <- approvals %>%
	    rename(diseaseId = DiseaseId) %>%
	    inner_join(moa, by = c("DrugId" = "chemblIds")) %>%
	    inner_join(interactions, by = c("targetId" = "targetA")) %>%
	    inner_join(
	        ass_indirectby_ds,
	        by = c("diseaseId", "targetB" = "targetId")
	    ) %>%
	    left_join(target_symbols, by = "targetB") %>%
	    select(Drug_brand_name, targetB, datasourceId, approvedSymbol) %>%
	    collect() %>%
	    mutate(datasourceId = str_replace(datasourceId, "eva", "clinvar")) %>%
	    filter(
	        !(datasourceId %in% c(
	            "chembl", "expression_atlas", "sysbio", "europepmc",
	            "phenodigm", "reactome", "phewas_catalog"
	        ))
	    ) %>%
	    distinct() %>%
	    group_by(Drug_brand_name) %>%
	    summarise(
	        interactingIds = paste(unique(targetB), collapse = ";"),
	        interactingSymbols = paste(unique(approvedSymbol), collapse = ";"),
	        interactingDataSources = paste(unique(datasourceId), collapse = ";")
	    )

	# Final table: one row per approval with its target ids and the summary,
	# direct, related-phenotype and interaction support computed per drug
	out <- ass %>%
	    group_by(
	        Drug_brand_name, Sponsor, DrugId, Indication, diseaseId, Properties
	    ) %>%
	    # `ass` holds one row per association, so a target can appear on many
	    # rows of the same drug; list each target once, as the other
	    # per-drug summaries do
	    summarise(targetIds = paste(unique(targetId), collapse = ";")) %>%
	    # summarise() only drops the last grouping variable
	    ungroup() %>%
	    left_join(summaryResults, by = "Drug_brand_name") %>%
	    left_join(directSources, by = "Drug_brand_name") %>%
	    left_join(closePhenotypes, by = "Drug_brand_name") %>%
	    left_join(intDf, by = "Drug_brand_name")

	# Write the table to disk
	out %>% write_csv("/home/ochoa/2021_approvals_output.csv")