Last active
November 22, 2020 19:12
-
-
Save acanakoglu/ed9fa4585a8d3828c1fbcb405dd2cfb3 to your computer and use it in GitHub Desktop.
ViruSurf views
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- DROP MATERIALIZED VIEW all_flatten_view; | |
CREATE MATERIALIZED VIEW all_flatten_view | |
WITH (FILLFACTOR = 100) | |
AS | |
SELECT sequence_id, accession_id, strain_name, is_reference, is_complete, strand, length, gc_percentage, n_percentage, sequencing_technology, assembly_method, coverage, sequencing_lab, submission_date, bioproject_id, database_source, taxon_id, taxon_name, species, host_taxon_name, collection_date, isolation_source, geo_group, country, region, gender, age, nucleotide_sequence, lineage, clade, host_taxon_id, originating_lab, genus, sub_family, family, equivalent_list, molecule_type, is_single_stranded, is_positive_stranded | |
FROM virus | |
NATURAL JOIN sequence | |
NATURAL JOIN host_sample | |
NATURAL JOIN host_specie | |
NATURAL JOIN experiment_type | |
NATURAL JOIN sequencing_project | |
; | |
-- PRIMARY KEY -- | |
CREATE UNIQUE INDEX all_flatten_view__sequence_id__uidx ON all_flatten_view(sequence_id) WITH (FILLFACTOR=100); | |
CLUSTER VERBOSE all_flatten_view USING all_flatten_view__sequence_id__uidx; | |
CREATE INDEX all_flatten_view__accession_id_lower__idx ON all_flatten_view(LOWER(accession_id)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__strain_name_lower__idx ON all_flatten_view(LOWER(strain_name)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__is_reference__idx ON all_flatten_view(is_reference) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__is_complete__idx ON all_flatten_view(is_complete) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__strand_lower__idx ON all_flatten_view(LOWER(strand)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__length__idx ON all_flatten_view(length) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__gc_percentage__idx ON all_flatten_view(gc_percentage) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__n_percentage__idx ON all_flatten_view(n_percentage) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__sequencing_technology_lower__idx ON all_flatten_view(LOWER(sequencing_technology)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__assembly_method_lower__idx ON all_flatten_view(LOWER(assembly_method)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__coverage__idx ON all_flatten_view(coverage) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__sequencing_lab_lower__idx ON all_flatten_view(LOWER(sequencing_lab)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__submission_date__idx ON all_flatten_view(submission_date) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__bioproject_id_lower__idx ON all_flatten_view(LOWER(bioproject_id)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__database_source_lower__idx ON all_flatten_view(LOWER(database_source)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__taxon_id__idx ON all_flatten_view(taxon_id) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__taxon_name_lower__idx ON all_flatten_view(LOWER(taxon_name)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__species_lower__idx ON all_flatten_view(LOWER(species)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__host_taxon_name_lower__idx ON all_flatten_view(LOWER(host_taxon_name)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__collection_date__idx ON all_flatten_view(collection_date) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__isolation_source_lower__idx ON all_flatten_view(LOWER(isolation_source)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__geo_group_lower__idx ON all_flatten_view(LOWER(geo_group)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__country_lower__idx ON all_flatten_view(LOWER(country)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__region_lower__idx ON all_flatten_view(LOWER(region)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__gender_lower__idx ON all_flatten_view(LOWER(gender)) WITH (FILLFACTOR=100); | |
CREATE INDEX all_flatten_view__age__idx ON all_flatten_view(age) WITH (FILLFACTOR=100); | |
ANALYZE VERBOSE all_flatten_view; | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- DROP MATERIALIZED VIEW nucleotide_variant_annotated; | |
CREATE MATERIALIZED VIEW nucleotide_variant_annotated | |
WITH (FILLFACTOR = 100) | |
AS | |
SELECT DISTINCT | |
nc.nucleotide_variant_id, | |
-- ann.annotation_id, -- --> possibly have annotation_id, too. but for now, we an skip that. | |
nc.sequence_id, | |
nc.variant_type, | |
nc.start_original, | |
nc.sequence_original, | |
nc.sequence_alternative, | |
nc.variant_length, | |
ann.feature_type AS n_feature_type, | |
ann.gene_name AS n_gene_name, | |
ann.product AS n_product | |
FROM nucleotide_variant as nc | |
LEFT JOIN annotation as ann -- --> LEFT? | |
ON nc.start_original >= ann.start | |
AND nc.start_original <= ann.stop | |
AND nc.sequence_id = ann.sequence_id | |
WHERE variant_length <= 20 -----------------> ?remove LIMIT 20 | |
-- and nc.sequence_id < 10 and ann.sequence_id < 10 -- FOR TESTING | |
; | |
-- JOIN KEY | |
CREATE INDEX nucleotide_variant_annotated__sequence_id__idx ON nucleotide_variant_annotated(sequence_id) WITH (FILLFACTOR = 100); | |
CLUSTER VERBOSE nucleotide_variant_annotated USING nucleotide_variant_annotated__sequence_id__idx; | |
CREATE INDEX nucleotide_variant_annotated__nucleotide_variant_id__idx ON nucleotide_variant_annotated(nucleotide_variant_id) WITH (FILLFACTOR = 100); | |
CREATE INDEX nucleotide_variant_annotated__variant_type_lower__idx ON nucleotide_variant_annotated(LOWER(variant_type)) WITH (FILLFACTOR = 100); | |
CREATE INDEX nucleotide_variant_annotated__start_original__idx ON nucleotide_variant_annotated(start_original) WITH (FILLFACTOR = 100); | |
CREATE INDEX nucleotide_variant_annotated__sequence_original_lower__idx ON nucleotide_variant_annotated(LOWER(sequence_original)) WITH (FILLFACTOR = 100); | |
CREATE INDEX nucleotide_variant_annotated__sequence_alternative_lower__idx ON nucleotide_variant_annotated(LOWER(sequence_alternative)) WITH (FILLFACTOR = 100); | |
CREATE INDEX nucleotide_variant_annotated__n_feature_type_lower__idx ON nucleotide_variant_annotated(LOWER(n_feature_type)) WITH (FILLFACTOR = 100); | |
CREATE INDEX nucleotide_variant_annotated__n_gene_name_lower__idx ON nucleotide_variant_annotated(LOWER(n_gene_name)) WITH (FILLFACTOR = 100); | |
CREATE INDEX nucleotide_variant_annotated__n_product__idx ON nucleotide_variant_annotated(LOWER(n_product)) WITH (FILLFACTOR = 100); | |
ANALYZE VERBOSE nucleotide_variant_annotated; | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH nucleotide_variant_impact AS ( | |
SELECT | |
nc.nucleotide_variant_id, | |
nc.sequence_id, | |
nc.variant_type, | |
nc.start_original, | |
nc.sequence_original, | |
nc.sequence_alternative, | |
nc.variant_length, | |
-- should be distinct!!!! | |
array_agg(--DISTINCT | |
ARRAY[effect, putative_impact, impact_gene_name] ORDER BY effect, putative_impact, impact_gene_name) --FILTER (WHERE imp.nucleotide_variant_id is not null) | |
as variant_impact_array | |
FROM nucleotide_variant as nc | |
LEFT JOIN variant_impact as imp ON nc.nucleotide_variant_id = imp.nucleotide_variant_id | |
--WHERE nc.sequence_id < 10 -- FOR TESTING | |
GROUP BY nc.nucleotide_variant_id | |
) | |
SELECT nc.*, | |
-- ann.annotation_id, -- --> possibly have annotation_id, too. but for now, we an skip that. | |
ann.feature_type AS n_feature_type, | |
ann.gene_name AS n_gene_name, | |
ann.product AS n_product | |
FROM nucleotide_variant_impact as nc | |
LEFT JOIN annotation as ann -- --> LEFT? | |
ON nc.start_original >= ann.start | |
AND nc.start_original <= ann.stop | |
AND nc.sequence_id = ann.sequence_id | |
WHERE variant_length <= 20 -----------------> ?remove LIMIT 20 | |
--second version | |
SELECT | |
nc.nucleotide_variant_id, | |
nc.sequence_id, | |
nc.variant_type, | |
nc.start_original, | |
nc.sequence_original, | |
nc.sequence_alternative, | |
nc.variant_length, | |
-- should be distinct!!!! | |
array_agg(--DISTINCT | |
ARRAY[effect, putative_impact, impact_gene_name] ORDER BY effect, putative_impact, impact_gene_name) --FILTER (WHERE imp.nucleotide_variant_id is not null) | |
as variant_impact_array, | |
--ann.annotation_id, -- --> possibly have annotation_id, too. but for now, we an skip that. | |
ann.feature_type AS n_feature_type, | |
ann.gene_name AS n_gene_name, | |
ann.product AS n_product | |
FROM nucleotide_variant as nc | |
LEFT JOIN variant_impact as imp ON nc.nucleotide_variant_id = imp.nucleotide_variant_id | |
LEFT JOIN annotation as ann -- --> LEFT? | |
ON nc.start_original >= ann.start | |
AND nc.start_original <= ann.stop | |
AND nc.sequence_id = ann.sequence_id | |
WHERE variant_length <= 20 -----------------> ?remove LIMIT 20 | |
GROUP BY nc.nucleotide_variant_id, ann.annotation_id |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment