Created
March 20, 2015 01:39
-
-
Save dbolser-ebi/b32cf4feb590c4e93545 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [dbolser@ebi-001 VCFLoad_simple]$ cat calculate_frequency.sql | |
| -- Try adding an index... | |
| #ALTER TABLE tmp_individual_genotype_single_bp | |
| # ADD INDEX allele_1_idx (allele_1), | |
| # ADD INDEX allele_2_idx (allele_2); | |
| # Query OK, 277545309 rows affected (43 min 29.78 sec) | |
| -- The above index seems to have no impact on query execution time | |
| -- below, as the individual_id_idx is picked instead of either of | |
| -- these allele indexes! | |
| -- Make a table with the allele_code IDs (and population_ids) instead | |
| -- of allele strings (and individual_ids). Bizarrely adding the JOIN | |
| -- to individual_population actually speeds up the query by a factor | |
| -- of approximately 2. | |
| OPTIMIZE TABLE allele_code; | |
| OPTIMIZE TABLE tmp_individual_genotype_single_bp; | |
| OPTIMIZE TABLE individual_genotype_multiple_bp; | |
| DROP TABLE IF EXISTS temp_individual_genotype; | |
| CREATE TEMPORARY TABLE temp_individual_genotype( | |
| INDEX (variation_id, population_id), | |
| INDEX (variation_id, population_id, allele_code_id_1), | |
| INDEX (variation_id, population_id, allele_code_id_2), | |
| INDEX (variation_id, population_id, allele_code_id_g) | |
| ) | |
| #EXPLAIN | |
| SELECT | |
| variation_id, population_id, | |
| one.allele_code_id AS allele_code_id_1, | |
| two.allele_code_id AS allele_code_id_2, | |
| CONCAT( | |
| one.allele_code_id, ':', | |
| two.allele_code_id) AS allele_code_id_g | |
| FROM | |
| tmp_individual_genotype_single_bp | |
| INNER JOIN | |
| individual_population p | |
| USING | |
| (individual_id) | |
| INNER JOIN | |
| allele_code one ON allele_1 = one.allele | |
| INNER JOIN | |
| allele_code two ON allele_2 = two.allele | |
| # | |
| #LIMIT | |
| # 0 | |
| ; | |
| # Query OK, 555,090,618 rows affected (3 hours 9 min 50.87 sec) | |
| # Records: 555,090,618 Duplicates: 0 Warnings: 0 | |
| # Query OK, 832,635,927 rows affected (3 hours 34 min 39.96 sec) | |
| # Records: 832,635,927 Duplicates: 0 Warnings: 0 | |
| # Query OK, 832,635,927 rows affected (5 hours 46 min 49.44 sec) | |
| # Records: 832,635,927 Duplicates: 0 Warnings: 0 | |
| OPTIMIZE TABLE temp_individual_genotype; | |
| # 1 row in set (10 min 50.60 sec) | |
| # 1 row in set ( 7 min 19.21 sec) | |
| # 1 row in set ( 7 min 55.38 sec) | |
| -- Add the multiple BP data to the same table | |
| INSERT INTO | |
| temp_individual_genotype | |
| SELECT | |
| variation_id, population_id, | |
| one.allele_code_id AS allele_code_id_1, | |
| two.allele_code_id AS allele_code_id_2, | |
| CONCAT( | |
| one.allele_code_id, ':', | |
| two.allele_code_id) AS allele_code_id_g | |
| FROM | |
| individual_genotype_multiple_bp | |
| INNER JOIN | |
| individual_population p | |
| USING | |
| (individual_id) | |
| INNER JOIN | |
| allele_code one ON allele_1 = one.allele | |
| INNER JOIN | |
| allele_code two ON allele_2 = two.allele | |
| # | |
| #LIMIT | |
| # 50000 | |
| ; | |
| #Query OK, 94,259,724 rows affected (29 min 33.71 sec) | |
| #Records: 94,259,724 Duplicates: 0 Warnings: 0 | |
| OPTIMIZE TABLE temp_individual_genotype; | |
| #1 row in set (1 min 5.82 sec) | |
| -- Now make tables for each allele, genotype, and the total... | |
| -- ALLELE (in three steps) | |
| -- Allele 1 | |
| DROP TABLE IF EXISTS temp_individual_genotype_ax; | |
| CREATE TEMPORARY TABLE temp_individual_genotype_ax | |
| -- Adding N into the index here avoids a costly data look-up later. | |
| (INDEX pk_ish (variation_id, population_id, allele_code_id, N)) | |
| #EXPLAIN | |
| SELECT variation_id, population_id, | |
| allele_code_id_1 AS allele_code_id, | |
| COUNT(*) AS N | |
| FROM temp_individual_genotype | |
| GROUP BY variation_id, population_id, allele_code_id_1; | |
| #Query OK, 327889779 rows affected (29 min 32.86 sec) | |
| OPTIMIZE TABLE temp_individual_genotype_ax; | |
| -- Allele 2 | |
| INSERT INTO temp_individual_genotype_ax | |
| #EXPLAIN | |
| SELECT variation_id, population_id, | |
| allele_code_id_2 AS allele_code_id, | |
| COUNT(*) AS N | |
| FROM temp_individual_genotype | |
| GROUP BY variation_id, population_id, allele_code_id_2; | |
| #Query OK, 290316464 rows affected (25 min 17.58 sec) | |
| OPTIMIZE TABLE temp_individual_genotype_ax; | |
| -- and finally... | |
| DROP TABLE IF EXISTS temp_individual_genotype_aa; | |
| CREATE TEMPORARY TABLE temp_individual_genotype_aa | |
| (PRIMARY KEY (variation_id, population_id, allele_code_id)) | |
| #EXPLAIN | |
| SELECT | |
| variation_id, population_id, allele_code_id, | |
| COUNT(*) AS X, SUM(N) AS N | |
| FROM | |
| temp_individual_genotype_ax | |
| GROUP BY | |
| variation_id, population_id, allele_code_id | |
| ; | |
| #Query OK, 399187969 rows affected (32 min 38.95 sec) | |
| OPTIMIZE TABLE temp_individual_genotype_aa; | |
| -- GENOTYPE | |
| DROP TABLE IF EXISTS temp_individual_genotype_gg; | |
| CREATE TEMPORARY TABLE temp_individual_genotype_gg | |
| (PRIMARY KEY (variation_id, population_id, allele_code_id_g)) | |
| #EXPLAIN | |
| SELECT | |
| variation_id, population_id, allele_code_id_g, | |
| COUNT(*) AS N | |
| FROM | |
| temp_individual_genotype | |
| GROUP BY | |
| variation_id, population_id, allele_code_id_g | |
| ; | |
| #Query OK, 329562440 rows affected (1 hour 21 min 20.40 sec) | |
| OPTIMIZE TABLE temp_individual_genotype_gg; | |
| -- TOTAL | |
| DROP TABLE IF EXISTS temp_individual_genotype_tt; | |
| CREATE TEMPORARY TABLE temp_individual_genotype_tt | |
| (PRIMARY KEY (variation_id, population_id)) | |
| #EXPLAIN | |
| SELECT | |
| variation_id, population_id, | |
| COUNT(*) AS N | |
| FROM | |
| temp_individual_genotype | |
| GROUP BY | |
| variation_id, population_id | |
| ; | |
| #Query OK, 283148189 rows affected (22 min 14.18 sec) | |
| OPTIMIZE TABLE temp_individual_genotype_tt; | |
| -- MAKE COUNTS LIKE THIS | |
| -- Allele | |
| RENAME TABLE allele TO allele_bk3; | |
| CREATE TABLE allele LIKE allele_bk3; | |
| INSERT INTO allele | |
| (variation_id, allele_code_id, population_id, count, frequency) | |
| SELECT | |
| variation_id, allele_code_id, population_id, | |
| a.N AS count, | |
| -- We picked up a factor of two somewhere above | |
| a.N / t.N / 2 AS frequency | |
| FROM | |
| temp_individual_genotype_aa a | |
| INNER JOIN | |
| temp_individual_genotype_tt t | |
| USING | |
| (variation_id, population_id) | |
| #LIMIT | |
| # 03 | |
| ; | |
| #Query OK, 399187969 rows affected, 1 warning (1 hour 42 min 43.03 sec) | |
| OPTIMIZE TABLE allele; | |
| -- Genotype | |
| -- For convenience we need to select population genotype in same | |
| -- format as allele_code_id_g above | |
| DROP TABLE IF EXISTS temp_genotype_code; | |
| CREATE TEMPORARY TABLE temp_genotype_code | |
| (PRIMARY KEY (allele_code_id_g)) AS | |
| SELECT | |
| genotype_code_id, | |
| CONCAT( | |
| one.allele_code_id, ':', | |
| two.allele_code_id) AS allele_code_id_g | |
| FROM | |
| genotype_code one | |
| INNER JOIN | |
| genotype_code two | |
| USING | |
| (genotype_code_id) | |
| WHERE | |
| one.haplotype_id = 1 AND | |
| two.haplotype_id = 2 | |
| ; | |
| #Query OK, 1000424 rows affected (12.56 sec) | |
| #Query OK, 1000424 rows affected (14.21 sec) | |
| OPTIMIZE TABLE temp_genotype_code; | |
| #1 row in set (0.27 sec) | |
| RENAME TABLE population_genotype TO population_genotype_bk3; | |
| CREATE TABLE population_genotype LIKE population_genotype_bk3; | |
| INSERT INTO population_genotype | |
| (variation_id, genotype_code_id, population_id, frequency) | |
| SELECT | |
| variation_id, genotype_code_id, population_id, | |
| g.N / t.N AS frequency | |
| FROM | |
| temp_individual_genotype_gg g | |
| INNER JOIN | |
| temp_individual_genotype_tt t | |
| USING | |
| (variation_id, population_id) | |
| INNER JOIN | |
| temp_genotype_code | |
| USING | |
| (allele_code_id_g) | |
| #LIMIT | |
| # 03 | |
| ; | |
| #Query OK, 367,890,086 rows affected, 1 warning (2 hours 31 min 41.78 sec) | |
| #Records: 367,890,086 Duplicates: 0 Warnings: 1 | |
| OPTIMIZE TABLE population_genotype; | |
| #1 row in set (2 min 50.12 sec) | |
| -- PRAY! | |
| DROP TABLE IF EXISTS | |
| temp_individual_genotype_tt, | |
| temp_individual_genotype_gg, | |
| temp_individual_genotype_aa, | |
| temp_individual_genotype_ax, | |
| temp_individual_genotype, | |
| temp_genotype_code | |
| ; | |
| [dbolser@ebi-001 VCFLoad_simple]$ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment