acanakoglu · November 22, 2020 19:12
diff --git a/all_flatten_view.sql b/all_flatten_view.sql
 -- DROP MATERIALIZED VIEW all_flatten_view;
 CREATE MATERIALIZED VIEW all_flatten_view 
 WITH (FILLFACTOR = 100) 
 AS 
 	SELECT sequence_id, accession_id, strain_name, is_reference, is_complete, strand, length, gc_percentage, n_percentage, sequencing_technology, assembly_method, coverage, sequencing_lab, submission_date, bioproject_id, database_source, taxon_id, taxon_name, species, host_taxon_name, collection_date, isolation_source, geo_group, country, region, gender, age, nucleotide_sequence, lineage, clade, host_taxon_id, originating_lab, genus, sub_family, family, equivalent_list, molecule_type, is_single_stranded, is_positive_stranded
 	FROM virus
 	NATURAL JOIN sequence
 	NATURAL JOIN host_sample
 	NATURAL JOIN host_specie
 	NATURAL JOIN experiment_type
 	NATURAL JOIN sequencing_project
 ;

 -- PRIMARY KEY --
 CREATE UNIQUE INDEX all_flatten_view__sequence_id__uidx ON all_flatten_view(sequence_id) WITH (FILLFACTOR=100);

 CLUSTER VERBOSE all_flatten_view USING all_flatten_view__sequence_id__uidx;



 CREATE INDEX all_flatten_view__accession_id_lower__idx ON all_flatten_view(LOWER(accession_id)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__strain_name_lower__idx ON all_flatten_view(LOWER(strain_name)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__is_reference__idx ON all_flatten_view(is_reference) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__is_complete__idx ON all_flatten_view(is_complete) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__strand_lower__idx ON all_flatten_view(LOWER(strand)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__length__idx ON all_flatten_view(length) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__gc_percentage__idx ON all_flatten_view(gc_percentage) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__n_percentage__idx ON all_flatten_view(n_percentage) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__sequencing_technology_lower__idx ON all_flatten_view(LOWER(sequencing_technology)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__assembly_method_lower__idx ON all_flatten_view(LOWER(assembly_method)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__coverage__idx ON all_flatten_view(coverage) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__sequencing_lab_lower__idx ON all_flatten_view(LOWER(sequencing_lab)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__submission_date__idx ON all_flatten_view(submission_date) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__bioproject_id_lower__idx ON all_flatten_view(LOWER(bioproject_id)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__database_source_lower__idx ON all_flatten_view(LOWER(database_source)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__taxon_id__idx ON all_flatten_view(taxon_id) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__taxon_name_lower__idx ON all_flatten_view(LOWER(taxon_name)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__species_lower__idx ON all_flatten_view(LOWER(species)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__host_taxon_name_lower__idx ON all_flatten_view(LOWER(host_taxon_name)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__collection_date__idx ON all_flatten_view(collection_date) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__isolation_source_lower__idx ON all_flatten_view(LOWER(isolation_source)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__geo_group_lower__idx ON all_flatten_view(LOWER(geo_group)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__country_lower__idx ON all_flatten_view(LOWER(country)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__region_lower__idx ON all_flatten_view(LOWER(region)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__gender_lower__idx ON all_flatten_view(LOWER(gender)) WITH (FILLFACTOR=100);

 CREATE INDEX all_flatten_view__age__idx ON all_flatten_view(age) WITH (FILLFACTOR=100);


 ANALYZE VERBOSE all_flatten_view;





diff --git a/nucleotide_variant_annotated.sql b/nucleotide_variant_annotated.sql
 -- DROP MATERIALIZED VIEW nucleotide_variant_annotated;
 CREATE MATERIALIZED VIEW nucleotide_variant_annotated 
 WITH (FILLFACTOR = 100)
 AS
 SELECT DISTINCT
  nc.nucleotide_variant_id,
  -- ann.annotation_id, -- --> possibly have annotation_id, too. but for now, we an skip that.

  nc.sequence_id,
  nc.variant_type,
  nc.start_original,
  nc.sequence_original,
  nc.sequence_alternative,
  nc.variant_length,

    ann.feature_type AS n_feature_type,
    ann.gene_name AS n_gene_name,
    ann.product AS n_product

   FROM nucleotide_variant as nc
     LEFT JOIN annotation as ann  -- --> LEFT?
   ON nc.start_original >= ann.start
  AND nc.start_original <= ann.stop
  AND nc.sequence_id = ann.sequence_id
  WHERE variant_length <= 20  -----------------> ?remove LIMIT 20
  -- and nc.sequence_id < 10 and ann.sequence_id < 10 -- FOR TESTING
  ;



 -- JOIN KEY
 CREATE INDEX nucleotide_variant_annotated__sequence_id__idx ON nucleotide_variant_annotated(sequence_id) WITH (FILLFACTOR = 100);

 CLUSTER VERBOSE nucleotide_variant_annotated USING nucleotide_variant_annotated__sequence_id__idx;


 CREATE INDEX nucleotide_variant_annotated__nucleotide_variant_id__idx ON nucleotide_variant_annotated(nucleotide_variant_id) WITH (FILLFACTOR = 100);


 CREATE INDEX nucleotide_variant_annotated__variant_type_lower__idx ON nucleotide_variant_annotated(LOWER(variant_type)) WITH (FILLFACTOR = 100);


 CREATE INDEX nucleotide_variant_annotated__start_original__idx ON nucleotide_variant_annotated(start_original) WITH (FILLFACTOR = 100);



 CREATE INDEX nucleotide_variant_annotated__sequence_original_lower__idx ON nucleotide_variant_annotated(LOWER(sequence_original)) WITH (FILLFACTOR = 100);


 CREATE INDEX nucleotide_variant_annotated__sequence_alternative_lower__idx ON nucleotide_variant_annotated(LOWER(sequence_alternative)) WITH (FILLFACTOR = 100);


 CREATE INDEX nucleotide_variant_annotated__n_feature_type_lower__idx ON nucleotide_variant_annotated(LOWER(n_feature_type)) WITH (FILLFACTOR = 100);


 CREATE INDEX nucleotide_variant_annotated__n_gene_name_lower__idx ON nucleotide_variant_annotated(LOWER(n_gene_name)) WITH (FILLFACTOR = 100);


 CREATE INDEX nucleotide_variant_annotated__n_product__idx ON nucleotide_variant_annotated(LOWER(n_product)) WITH (FILLFACTOR = 100);



 ANALYZE VERBOSE nucleotide_variant_annotated;














diff --git a/nucleotide_variant_impact_annotated.sql b/nucleotide_variant_impact_annotated.sql
 WITH nucleotide_variant_impact AS (
 	SELECT
 		nc.nucleotide_variant_id,
 		nc.sequence_id,
 		nc.variant_type,
 		nc.start_original,
 		nc.sequence_original,
 		nc.sequence_alternative,
 		nc.variant_length,
 			-- should be distinct!!!!
 		array_agg(--DISTINCT
 			ARRAY[effect, putative_impact, impact_gene_name] ORDER BY effect, putative_impact, impact_gene_name) --FILTER (WHERE imp.nucleotide_variant_id is not null)  
 									as variant_impact_array
 	FROM nucleotide_variant as nc
 	LEFT JOIN variant_impact as imp ON nc.nucleotide_variant_id = imp.nucleotide_variant_id
 	--WHERE nc.sequence_id < 10 -- FOR TESTING
 	GROUP BY nc.nucleotide_variant_id
 )
 SELECT nc.*,
 	-- ann.annotation_id, -- --> possibly have annotation_id, too. but for now, we an skip that.
    ann.feature_type AS n_feature_type,
    ann.gene_name AS n_gene_name,
    ann.product AS n_product
 FROM nucleotide_variant_impact as nc
 LEFT JOIN annotation as ann  -- --> LEFT?
 				   ON nc.start_original >= ann.start
 				  AND nc.start_original <= ann.stop
 				  AND nc.sequence_id = ann.sequence_id
 WHERE variant_length <= 20  -----------------> ?remove LIMIT 20


 --second version

 SELECT
 	nc.nucleotide_variant_id,
 	nc.sequence_id,
 	nc.variant_type,
 	nc.start_original,
 	nc.sequence_original,
 	nc.sequence_alternative,
 	nc.variant_length,
 		-- should be distinct!!!!
 	array_agg(--DISTINCT
 		ARRAY[effect, putative_impact, impact_gene_name] ORDER BY effect, putative_impact, impact_gene_name) --FILTER (WHERE imp.nucleotide_variant_id is not null)  
 								as variant_impact_array,
 	--ann.annotation_id, -- --> possibly have annotation_id, too. but for now, we an skip that.
 	ann.feature_type AS n_feature_type,
 	ann.gene_name AS n_gene_name,
 	ann.product AS n_product


 FROM nucleotide_variant as nc
 LEFT JOIN variant_impact as imp ON nc.nucleotide_variant_id = imp.nucleotide_variant_id
 LEFT JOIN annotation as ann  -- --> LEFT?
 			   ON nc.start_original >= ann.start
 			  AND nc.start_original <= ann.stop
 			  AND nc.sequence_id = ann.sequence_id
 WHERE variant_length <= 20  -----------------> ?remove LIMIT 20
 	GROUP BY nc.nucleotide_variant_id, ann.annotation_id
	-- DROP MATERIALIZED VIEW all_flatten_view;
	CREATE MATERIALIZED VIEW all_flatten_view
	WITH (FILLFACTOR = 100)
	AS
	SELECT sequence_id, accession_id, strain_name, is_reference, is_complete, strand, length, gc_percentage, n_percentage, sequencing_technology, assembly_method, coverage, sequencing_lab, submission_date, bioproject_id, database_source, taxon_id, taxon_name, species, host_taxon_name, collection_date, isolation_source, geo_group, country, region, gender, age, nucleotide_sequence, lineage, clade, host_taxon_id, originating_lab, genus, sub_family, family, equivalent_list, molecule_type, is_single_stranded, is_positive_stranded
	FROM virus
	NATURAL JOIN sequence
	NATURAL JOIN host_sample
	NATURAL JOIN host_specie
	NATURAL JOIN experiment_type
	NATURAL JOIN sequencing_project
	;

	-- PRIMARY KEY --
	CREATE UNIQUE INDEX all_flatten_view__sequence_id__uidx ON all_flatten_view(sequence_id) WITH (FILLFACTOR=100);

	CLUSTER VERBOSE all_flatten_view USING all_flatten_view__sequence_id__uidx;



	CREATE INDEX all_flatten_view__accession_id_lower__idx ON all_flatten_view(LOWER(accession_id)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__strain_name_lower__idx ON all_flatten_view(LOWER(strain_name)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__is_reference__idx ON all_flatten_view(is_reference) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__is_complete__idx ON all_flatten_view(is_complete) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__strand_lower__idx ON all_flatten_view(LOWER(strand)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__length__idx ON all_flatten_view(length) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__gc_percentage__idx ON all_flatten_view(gc_percentage) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__n_percentage__idx ON all_flatten_view(n_percentage) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__sequencing_technology_lower__idx ON all_flatten_view(LOWER(sequencing_technology)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__assembly_method_lower__idx ON all_flatten_view(LOWER(assembly_method)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__coverage__idx ON all_flatten_view(coverage) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__sequencing_lab_lower__idx ON all_flatten_view(LOWER(sequencing_lab)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__submission_date__idx ON all_flatten_view(submission_date) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__bioproject_id_lower__idx ON all_flatten_view(LOWER(bioproject_id)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__database_source_lower__idx ON all_flatten_view(LOWER(database_source)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__taxon_id__idx ON all_flatten_view(taxon_id) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__taxon_name_lower__idx ON all_flatten_view(LOWER(taxon_name)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__species_lower__idx ON all_flatten_view(LOWER(species)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__host_taxon_name_lower__idx ON all_flatten_view(LOWER(host_taxon_name)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__collection_date__idx ON all_flatten_view(collection_date) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__isolation_source_lower__idx ON all_flatten_view(LOWER(isolation_source)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__geo_group_lower__idx ON all_flatten_view(LOWER(geo_group)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__country_lower__idx ON all_flatten_view(LOWER(country)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__region_lower__idx ON all_flatten_view(LOWER(region)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__gender_lower__idx ON all_flatten_view(LOWER(gender)) WITH (FILLFACTOR=100);

	CREATE INDEX all_flatten_view__age__idx ON all_flatten_view(age) WITH (FILLFACTOR=100);


	ANALYZE VERBOSE all_flatten_view;
	-- DROP MATERIALIZED VIEW nucleotide_variant_annotated;
	CREATE MATERIALIZED VIEW nucleotide_variant_annotated
	WITH (FILLFACTOR = 100)
	AS
	SELECT DISTINCT
	nc.nucleotide_variant_id,
	-- ann.annotation_id, -- --> possibly have annotation_id, too. but for now, we an skip that.

	nc.sequence_id,
	nc.variant_type,
	nc.start_original,
	nc.sequence_original,
	nc.sequence_alternative,
	nc.variant_length,

	ann.feature_type AS n_feature_type,
	ann.gene_name AS n_gene_name,
	ann.product AS n_product

	FROM nucleotide_variant as nc
	LEFT JOIN annotation as ann -- --> LEFT?
	ON nc.start_original >= ann.start
	AND nc.start_original <= ann.stop
	AND nc.sequence_id = ann.sequence_id
	WHERE variant_length <= 20 -----------------> ?remove LIMIT 20
	-- and nc.sequence_id < 10 and ann.sequence_id < 10 -- FOR TESTING
	;



	-- JOIN KEY
	CREATE INDEX nucleotide_variant_annotated__sequence_id__idx ON nucleotide_variant_annotated(sequence_id) WITH (FILLFACTOR = 100);

	CLUSTER VERBOSE nucleotide_variant_annotated USING nucleotide_variant_annotated__sequence_id__idx;


	CREATE INDEX nucleotide_variant_annotated__nucleotide_variant_id__idx ON nucleotide_variant_annotated(nucleotide_variant_id) WITH (FILLFACTOR = 100);


	CREATE INDEX nucleotide_variant_annotated__variant_type_lower__idx ON nucleotide_variant_annotated(LOWER(variant_type)) WITH (FILLFACTOR = 100);


	CREATE INDEX nucleotide_variant_annotated__start_original__idx ON nucleotide_variant_annotated(start_original) WITH (FILLFACTOR = 100);



	CREATE INDEX nucleotide_variant_annotated__sequence_original_lower__idx ON nucleotide_variant_annotated(LOWER(sequence_original)) WITH (FILLFACTOR = 100);


	CREATE INDEX nucleotide_variant_annotated__sequence_alternative_lower__idx ON nucleotide_variant_annotated(LOWER(sequence_alternative)) WITH (FILLFACTOR = 100);


	CREATE INDEX nucleotide_variant_annotated__n_feature_type_lower__idx ON nucleotide_variant_annotated(LOWER(n_feature_type)) WITH (FILLFACTOR = 100);


	CREATE INDEX nucleotide_variant_annotated__n_gene_name_lower__idx ON nucleotide_variant_annotated(LOWER(n_gene_name)) WITH (FILLFACTOR = 100);


	CREATE INDEX nucleotide_variant_annotated__n_product__idx ON nucleotide_variant_annotated(LOWER(n_product)) WITH (FILLFACTOR = 100);



	ANALYZE VERBOSE nucleotide_variant_annotated;
	WITH nucleotide_variant_impact AS (
	SELECT
	nc.nucleotide_variant_id,
	nc.sequence_id,
	nc.variant_type,
	nc.start_original,
	nc.sequence_original,
	nc.sequence_alternative,
	nc.variant_length,
	-- should be distinct!!!!
	array_agg(--DISTINCT
	ARRAY[effect, putative_impact, impact_gene_name] ORDER BY effect, putative_impact, impact_gene_name) --FILTER (WHERE imp.nucleotide_variant_id is not null)
	as variant_impact_array
	FROM nucleotide_variant as nc
	LEFT JOIN variant_impact as imp ON nc.nucleotide_variant_id = imp.nucleotide_variant_id
	--WHERE nc.sequence_id < 10 -- FOR TESTING
	GROUP BY nc.nucleotide_variant_id
	)
	SELECT nc.*,
	-- ann.annotation_id, -- --> possibly have annotation_id, too. but for now, we an skip that.
	ann.feature_type AS n_feature_type,
	ann.gene_name AS n_gene_name,
	ann.product AS n_product
	FROM nucleotide_variant_impact as nc
	LEFT JOIN annotation as ann -- --> LEFT?
	ON nc.start_original >= ann.start
	AND nc.start_original <= ann.stop
	AND nc.sequence_id = ann.sequence_id
	WHERE variant_length <= 20 -----------------> ?remove LIMIT 20


	--second version

	SELECT
	nc.nucleotide_variant_id,
	nc.sequence_id,
	nc.variant_type,
	nc.start_original,
	nc.sequence_original,
	nc.sequence_alternative,
	nc.variant_length,
	-- should be distinct!!!!
	array_agg(--DISTINCT
	ARRAY[effect, putative_impact, impact_gene_name] ORDER BY effect, putative_impact, impact_gene_name) --FILTER (WHERE imp.nucleotide_variant_id is not null)
	as variant_impact_array,
	--ann.annotation_id, -- --> possibly have annotation_id, too. but for now, we an skip that.
	ann.feature_type AS n_feature_type,
	ann.gene_name AS n_gene_name,
	ann.product AS n_product


	FROM nucleotide_variant as nc
	LEFT JOIN variant_impact as imp ON nc.nucleotide_variant_id = imp.nucleotide_variant_id
	LEFT JOIN annotation as ann -- --> LEFT?
	ON nc.start_original >= ann.start
	AND nc.start_original <= ann.stop
	AND nc.sequence_id = ann.sequence_id
	WHERE variant_length <= 20 -----------------> ?remove LIMIT 20
	GROUP BY nc.nucleotide_variant_id, ann.annotation_id