Created
February 11, 2015 11:51
-
-
Save dbolser-ebi/e61f78968eb816610c0c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Try adding an index... | |
#ALTER TABLE tmp_individual_genotype_single_bp | |
# ADD INDEX allele_1_idx (allele_1), | |
# ADD INDEX allele_2_idx (allele_2); | |
# Query OK, 277545309 rows affected (43 min 29.78 sec) | |
-- The above index seems to have no impact on query execution time | |
-- below, as the individual_id_idx is picked instead of either of | |
-- these allele indexes! | |
-- Make a table with the allele_code IDs (and population_ids) instead | |
-- of allele strings (and individual_ids). Bizarrely adding the JOIN | |
-- to individual_population actually speeds up the query by a factor | |
-- of approximately 2. | |
OPTIMIZE TABLE tmp_individual_genotype_single_bp; | |
OPTIMIZE TABLE allele_code; | |
DROP TABLE IF EXISTS temp_individual_genotype; | |
CREATE TEMPORARY TABLE temp_individual_genotype( | |
INDEX (variation_id, population_id), | |
INDEX (variation_id, population_id, allele_code_id_1), | |
INDEX (variation_id, population_id, allele_code_id_2), | |
INDEX (variation_id, population_id, allele_code_id_g) | |
) | |
#EXPLAIN | |
SELECT | |
variation_id, population_id, | |
one.allele_code_id AS allele_code_id_1, | |
two.allele_code_id AS allele_code_id_2, | |
CONCAT( | |
one.allele_code_id, ':', | |
two.allele_code_id) AS allele_code_id_g | |
FROM | |
tmp_individual_genotype_single_bp | |
INNER JOIN | |
individual_population p | |
USING | |
(individual_id) | |
INNER JOIN | |
allele_code one ON allele_1 = one.allele | |
INNER JOIN | |
allele_code two ON allele_2 = two.allele | |
# | |
#LIMIT | |
# 50000000 | |
; | |
# Query OK, 555,090,618 rows affected (3 hours 9 min 50.87 sec) | |
# Records: 555,090,618 Duplicates: 0 Warnings: 0 | |
OPTIMIZE TABLE temp_individual_genotype; | |
# 1 row in set (10 min 50.60 sec) | |
-- TODO: Add the multiple BP data to the same table here! | |
-- Now make tables for each allele, genotype, and the total... | |
CREATE TABLE temp_individual_genotype_tt | |
(PRIMARY KEY (variation_id, population_id)) | |
#EXPLAIN | |
SELECT variation_id, population_id, | |
COUNT(*) AS N | |
FROM temp_individual_genotype | |
GROUP BY variation_id, population_id; | |
CREATE TABLE temp_individual_genotype_a1 | |
(PRIMARY KEY (variation_id, population_id, allele_code_id_1)) | |
#EXPLAIN | |
SELECT variation_id, population_id, allele_code_id_1, | |
COUNT(*) AS N | |
FROM temp_individual_genotype | |
GROUP BY variation_id, population_id, allele_code_id_1; | |
CREATE TABLE temp_individual_genotype_a2 | |
(PRIMARY KEY (variation_id, population_id, allele_code_id_2)) | |
#EXPLAIN | |
SELECT variation_id, population_id, allele_code_id_2, | |
COUNT(*) AS N | |
FROM temp_individual_genotype | |
GROUP BY variation_id, population_id, allele_code_id_2; | |
CREATE TABLE temp_individual_genotype_gg | |
(PRIMARY KEY (variation_id, population_id, allele_code_id_g)) | |
#EXPLAIN | |
SELECT variation_id, population_id, allele_code_id_g, | |
COUNT(*) AS N | |
FROM temp_individual_genotype | |
GROUP BY variation_id, population_id, allele_code_id_g; | |
OPTIMIZE TABLE temp_individual_genotype_tt; | |
OPTIMIZE TABLE temp_individual_genotype_a1; | |
OPTIMIZE TABLE temp_individual_genotype_a2; | |
OPTIMIZE TABLE temp_individual_genotype_gg; | |
-- Now do an annoying shuffle to combine a1 and a2... Adding N into | |
-- the index here avoids a costly data look-up later. TODO: Should | |
-- merge above steps with this one. | |
DROP TABLE IF EXISTS temp_individual_genotype_ax; | |
CREATE TABLE temp_individual_genotype_ax | |
(INDEX pk_ish (variation_id, population_id, allele_code_id, N)) | |
SELECT | |
variation_id, population_id, allele_code_id_1 AS allele_code_id, N | |
FROM | |
temp_individual_genotype_a1 | |
# LIMIT | |
# 6000000 | |
; | |
INSERT INTO temp_individual_genotype_ax | |
SELECT | |
variation_id, population_id, allele_code_id_2 AS allele_code_id, N | |
FROM | |
temp_individual_genotype_a2 | |
# LIMIT | |
# 6000000 | |
; | |
OPTIMIZE TABLE temp_individual_genotype_ax; | |
-- and finally... | |
CREATE TABLE temp_individual_genotype_aa | |
(PRIMARY KEY (variation_id, population_id, allele_code_id)) | |
#EXPLAIN | |
SELECT | |
variation_id, population_id, allele_code_id, | |
COUNT(*) AS X, SUM(N) AS N | |
FROM | |
temp_individual_genotype_ax | |
GROUP BY | |
variation_id, population_id, allele_code_id | |
; | |
OPTIMIZE TABLE temp_individual_genotype_aa; | |
-- MAKE COUNTS LIKE THIS | |
-- Allele | |
RENAME TABLE allele TO allele_bk; | |
CREATE TABLE allele LIKE allele_bk; | |
INSERT INTO allele | |
(variation_id, allele_code_id, population_id, count, frequency) | |
SELECT | |
variation_id, allele_code_id, population_id, | |
a.N AS count, | |
a.N / t.N AS frequency | |
FROM | |
temp_individual_genotype_tt t | |
INNER JOIN | |
temp_individual_genotype_aa a | |
USING | |
(variation_id, population_id) | |
#LIMIT | |
# 03 | |
; | |
OPTIMIZE TABLE allele; | |
-- Genotype | |
-- Select population genotype in same format as allele_code_id_g above | |
CREATE TABLE temp_genotype_code | |
(PRIMARY KEY (allele_code_id_g)) AS | |
SELECT | |
genotype_code_id, | |
CONCAT( | |
one.allele_code_id, ':', | |
two.allele_code_id) AS allele_code_id_g | |
FROM | |
genotype_code one | |
INNER JOIN | |
genotype_code two | |
USING | |
(genotype_code_id) | |
WHERE | |
one.haplotype_id = 1 AND | |
two.haplotype_id = 2 | |
; | |
OPTIMIZE TABLE temp_genotype_code; | |
RENAME TABLE population_genotype TO population_genotype_bk; | |
CREATE TABLE population_genotype LIKE population_genotype_bk; | |
INSERT INTO population_genotype | |
(variation_id, genotype_code_id, population_id, frequency) | |
SELECT | |
variation_id, genotype_code_id, population_id, | |
g.N / t.N AS frequency | |
FROM | |
temp_individual_genotype_tt t | |
INNER JOIN | |
temp_individual_genotype_gg g | |
USING | |
(variation_id, population_id) | |
INNER JOIN | |
temp_genotype_code | |
USING | |
(allele_code_id_g) | |
#LIMIT | |
# 03 | |
; | |
OPTIMIZE TABLE population_genotype; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment