dbolser-ebi · February 11, 2015 11:51
diff --git a/gistfile1.sql b/gistfile1.sql

 -- Try adding an index...
 #ALTER TABLE tmp_individual_genotype_single_bp
 #  ADD INDEX allele_1_idx (allele_1),
 #  ADD INDEX allele_2_idx (allele_2);

 # Query OK, 277545309 rows affected (43 min 29.78 sec)

 -- The above index seems to have no impact on query execution time
 -- below, as the individual_id_idx is picked instead of either of
 -- these allele indexes!



 -- Make a table with the allele_code IDs (and population_ids) instead
 -- of allele strings (and individual_ids).  Bizarrely adding the JOIN
 -- to individual_population actually speeds up the query by a factor
 -- of approximately 2.

 OPTIMIZE TABLE tmp_individual_genotype_single_bp;
 OPTIMIZE TABLE allele_code;

 DROP             TABLE IF EXISTS temp_individual_genotype;
 CREATE TEMPORARY TABLE           temp_individual_genotype(
  INDEX (variation_id, population_id),
  INDEX (variation_id, population_id, allele_code_id_1),
  INDEX (variation_id, population_id, allele_code_id_2),
  INDEX (variation_id, population_id, allele_code_id_g)
 )
 #EXPLAIN
 SELECT
  variation_id, population_id,
  one.allele_code_id    AS allele_code_id_1,
  two.allele_code_id    AS allele_code_id_2,
  CONCAT(
    one.allele_code_id, ':',
    two.allele_code_id) AS allele_code_id_g
 FROM
  tmp_individual_genotype_single_bp
 INNER JOIN
  individual_population    p
 USING
  (individual_id)
 INNER JOIN
  allele_code one ON allele_1 = one.allele
 INNER JOIN
  allele_code two ON allele_2 = two.allele
 #
 #LIMIT
 #  50000000
 ;

 # Query OK, 555,090,618 rows affected (3 hours 9 min 50.87 sec)
 # Records:  555,090,618  Duplicates: 0  Warnings: 0

 OPTIMIZE TABLE temp_individual_genotype;
 # 1 row in set (10 min 50.60 sec)


 -- TODO: Add the multiple BP data to the same table here!





 -- Now make tables for each allele, genotype, and the total...
 CREATE TABLE temp_individual_genotype_tt
  (PRIMARY KEY (variation_id, population_id))
 #EXPLAIN
 SELECT   variation_id, population_id,
         COUNT(*) AS N
 FROM     temp_individual_genotype
 GROUP BY variation_id, population_id;

 CREATE TABLE temp_individual_genotype_a1
  (PRIMARY KEY (variation_id, population_id, allele_code_id_1))
 #EXPLAIN
 SELECT   variation_id, population_id, allele_code_id_1,
         COUNT(*) AS N
 FROM     temp_individual_genotype
 GROUP BY variation_id, population_id, allele_code_id_1;

 CREATE TABLE temp_individual_genotype_a2
  (PRIMARY KEY (variation_id, population_id, allele_code_id_2))
 #EXPLAIN
 SELECT   variation_id, population_id, allele_code_id_2,
         COUNT(*) AS N
 FROM     temp_individual_genotype
 GROUP BY variation_id, population_id, allele_code_id_2;

 CREATE TABLE temp_individual_genotype_gg
  (PRIMARY KEY (variation_id, population_id, allele_code_id_g))
 #EXPLAIN
 SELECT   variation_id, population_id, allele_code_id_g,
         COUNT(*) AS N
 FROM     temp_individual_genotype
 GROUP BY variation_id, population_id, allele_code_id_g;

 OPTIMIZE TABLE temp_individual_genotype_tt;
 OPTIMIZE TABLE temp_individual_genotype_a1;
 OPTIMIZE TABLE temp_individual_genotype_a2;
 OPTIMIZE TABLE temp_individual_genotype_gg;



 -- Now do an annoying shuffle to combine a1 and a2... Adding N into
 -- the index here avoids a costly data look-up later. TODO: Should
 -- merge above steps with this one.
 DROP   TABLE IF EXISTS temp_individual_genotype_ax;
 CREATE TABLE           temp_individual_genotype_ax
 (INDEX pk_ish (variation_id, population_id, allele_code_id, N))
 SELECT
  variation_id, population_id, allele_code_id_1 AS allele_code_id, N
 FROM
  temp_individual_genotype_a1
 #  LIMIT
 #    6000000
 ;

 INSERT INTO            temp_individual_genotype_ax
 SELECT
  variation_id, population_id, allele_code_id_2 AS allele_code_id, N
 FROM
  temp_individual_genotype_a2
 #  LIMIT
 #    6000000
 ;

 OPTIMIZE TABLE temp_individual_genotype_ax;

 -- and finally...
 CREATE TABLE temp_individual_genotype_aa
  (PRIMARY KEY (variation_id, population_id, allele_code_id))
 #EXPLAIN
 SELECT 
  variation_id, population_id, allele_code_id,
  COUNT(*) AS X, SUM(N) AS N
 FROM
  temp_individual_genotype_ax
 GROUP BY
  variation_id, population_id, allele_code_id
 ;

 OPTIMIZE TABLE temp_individual_genotype_aa;





 -- MAKE COUNTS LIKE THIS

 -- Allele

 RENAME TABLE allele TO   allele_bk;
 CREATE TABLE allele LIKE allele_bk;

 INSERT INTO allele
  (variation_id, allele_code_id, population_id, count, frequency)
 SELECT
  variation_id, allele_code_id, population_id,
  a.N       AS count,
  a.N / t.N AS frequency
 FROM
  temp_individual_genotype_tt t
 INNER JOIN
  temp_individual_genotype_aa a
 USING
  (variation_id, population_id)
 #LIMIT
 #  03
 ;

 OPTIMIZE TABLE allele;



 -- Genotype

 -- Select population genotype in same format as allele_code_id_g above

 CREATE TABLE temp_genotype_code
  (PRIMARY KEY (allele_code_id_g)) AS
 SELECT
  genotype_code_id,
  CONCAT(
    one.allele_code_id, ':',
    two.allele_code_id) AS allele_code_id_g
 FROM
  genotype_code one
 INNER JOIN
  genotype_code two
 USING
  (genotype_code_id)
 WHERE
  one.haplotype_id = 1 AND
  two.haplotype_id = 2 
 ;

 OPTIMIZE TABLE temp_genotype_code;

 RENAME TABLE population_genotype TO   population_genotype_bk;
 CREATE TABLE population_genotype LIKE population_genotype_bk;

 INSERT INTO population_genotype
  (variation_id, genotype_code_id, population_id, frequency)
 SELECT
  variation_id, genotype_code_id, population_id,
  g.N / t.N AS frequency
 FROM
  temp_individual_genotype_tt t
 INNER JOIN
  temp_individual_genotype_gg g
 USING
  (variation_id, population_id)
 INNER JOIN
  temp_genotype_code
 USING
  (allele_code_id_g)
 #LIMIT
 #  03
 ;

 OPTIMIZE TABLE population_genotype;

	-- Try adding an index...
	#ALTER TABLE tmp_individual_genotype_single_bp
	# ADD INDEX allele_1_idx (allele_1),
	# ADD INDEX allele_2_idx (allele_2);

	# Query OK, 277545309 rows affected (43 min 29.78 sec)

	-- The above index seems to have no impact on query execution time
	-- below, as the individual_id_idx is picked instead of either of
	-- these allele indexes!



	-- Make a table with the allele_code IDs (and population_ids) instead
	-- of allele strings (and individual_ids). Bizarrely adding the JOIN
	-- to individual_population actually speeds up the query by a factor
	-- of approximately 2.

	OPTIMIZE TABLE tmp_individual_genotype_single_bp;
	OPTIMIZE TABLE allele_code;

	DROP TABLE IF EXISTS temp_individual_genotype;
	CREATE TEMPORARY TABLE temp_individual_genotype(
	INDEX (variation_id, population_id),
	INDEX (variation_id, population_id, allele_code_id_1),
	INDEX (variation_id, population_id, allele_code_id_2),
	INDEX (variation_id, population_id, allele_code_id_g)
	)
	#EXPLAIN
	SELECT
	variation_id, population_id,
	one.allele_code_id AS allele_code_id_1,
	two.allele_code_id AS allele_code_id_2,
	CONCAT(
	one.allele_code_id, ':',
	two.allele_code_id) AS allele_code_id_g
	FROM
	tmp_individual_genotype_single_bp
	INNER JOIN
	individual_population p
	USING
	(individual_id)
	INNER JOIN
	allele_code one ON allele_1 = one.allele
	INNER JOIN
	allele_code two ON allele_2 = two.allele
	#
	#LIMIT
	# 50000000
	;

	# Query OK, 555,090,618 rows affected (3 hours 9 min 50.87 sec)
	# Records: 555,090,618 Duplicates: 0 Warnings: 0

	OPTIMIZE TABLE temp_individual_genotype;
	# 1 row in set (10 min 50.60 sec)


	-- TODO: Add the multiple BP data to the same table here!





	-- Now make tables for each allele, genotype, and the total...
	CREATE TABLE temp_individual_genotype_tt
	(PRIMARY KEY (variation_id, population_id))
	#EXPLAIN
	SELECT variation_id, population_id,
	COUNT(*) AS N
	FROM temp_individual_genotype
	GROUP BY variation_id, population_id;

	CREATE TABLE temp_individual_genotype_a1
	(PRIMARY KEY (variation_id, population_id, allele_code_id_1))
	#EXPLAIN
	SELECT variation_id, population_id, allele_code_id_1,
	COUNT(*) AS N
	FROM temp_individual_genotype
	GROUP BY variation_id, population_id, allele_code_id_1;

	CREATE TABLE temp_individual_genotype_a2
	(PRIMARY KEY (variation_id, population_id, allele_code_id_2))
	#EXPLAIN
	SELECT variation_id, population_id, allele_code_id_2,
	COUNT(*) AS N
	FROM temp_individual_genotype
	GROUP BY variation_id, population_id, allele_code_id_2;

	CREATE TABLE temp_individual_genotype_gg
	(PRIMARY KEY (variation_id, population_id, allele_code_id_g))
	#EXPLAIN
	SELECT variation_id, population_id, allele_code_id_g,
	COUNT(*) AS N
	FROM temp_individual_genotype
	GROUP BY variation_id, population_id, allele_code_id_g;

	OPTIMIZE TABLE temp_individual_genotype_tt;
	OPTIMIZE TABLE temp_individual_genotype_a1;
	OPTIMIZE TABLE temp_individual_genotype_a2;
	OPTIMIZE TABLE temp_individual_genotype_gg;



	-- Now do an annoying shuffle to combine a1 and a2... Adding N into
	-- the index here avoids a costly data look-up later. TODO: Should
	-- merge above steps with this one.
	DROP TABLE IF EXISTS temp_individual_genotype_ax;
	CREATE TABLE temp_individual_genotype_ax
	(INDEX pk_ish (variation_id, population_id, allele_code_id, N))
	SELECT
	variation_id, population_id, allele_code_id_1 AS allele_code_id, N
	FROM
	temp_individual_genotype_a1
	# LIMIT
	# 6000000
	;

	INSERT INTO temp_individual_genotype_ax
	SELECT
	variation_id, population_id, allele_code_id_2 AS allele_code_id, N
	FROM
	temp_individual_genotype_a2
	# LIMIT
	# 6000000
	;

	OPTIMIZE TABLE temp_individual_genotype_ax;

	-- and finally...
	CREATE TABLE temp_individual_genotype_aa
	(PRIMARY KEY (variation_id, population_id, allele_code_id))
	#EXPLAIN
	SELECT
	variation_id, population_id, allele_code_id,
	COUNT(*) AS X, SUM(N) AS N
	FROM
	temp_individual_genotype_ax
	GROUP BY
	variation_id, population_id, allele_code_id
	;

	OPTIMIZE TABLE temp_individual_genotype_aa;





	-- MAKE COUNTS LIKE THIS

	-- Allele

	RENAME TABLE allele TO allele_bk;
	CREATE TABLE allele LIKE allele_bk;

	INSERT INTO allele
	(variation_id, allele_code_id, population_id, count, frequency)
	SELECT
	variation_id, allele_code_id, population_id,
	a.N AS count,
	a.N / t.N AS frequency
	FROM
	temp_individual_genotype_tt t
	INNER JOIN
	temp_individual_genotype_aa a
	USING
	(variation_id, population_id)
	#LIMIT
	# 03
	;

	OPTIMIZE TABLE allele;



	-- Genotype

	-- Select population genotype in same format as allele_code_id_g above

	CREATE TABLE temp_genotype_code
	(PRIMARY KEY (allele_code_id_g)) AS
	SELECT
	genotype_code_id,
	CONCAT(
	one.allele_code_id, ':',
	two.allele_code_id) AS allele_code_id_g
	FROM
	genotype_code one
	INNER JOIN
	genotype_code two
	USING
	(genotype_code_id)
	WHERE
	one.haplotype_id = 1 AND
	two.haplotype_id = 2
	;

	OPTIMIZE TABLE temp_genotype_code;

	RENAME TABLE population_genotype TO population_genotype_bk;
	CREATE TABLE population_genotype LIKE population_genotype_bk;

	INSERT INTO population_genotype
	(variation_id, genotype_code_id, population_id, frequency)
	SELECT
	variation_id, genotype_code_id, population_id,
	g.N / t.N AS frequency
	FROM
	temp_individual_genotype_tt t
	INNER JOIN
	temp_individual_genotype_gg g
	USING
	(variation_id, population_id)
	INNER JOIN
	temp_genotype_code
	USING
	(allele_code_id_g)
	#LIMIT
	# 03
	;

	OPTIMIZE TABLE population_genotype;