Skip to content

Instantly share code, notes, and snippets.

@dbolser-ebi
Created March 20, 2015 01:39
Show Gist options
  • Select an option

  • Save dbolser-ebi/b32cf4feb590c4e93545 to your computer and use it in GitHub Desktop.

Select an option

Save dbolser-ebi/b32cf4feb590c4e93545 to your computer and use it in GitHub Desktop.
[dbolser@ebi-001 VCFLoad_simple]$ cat calculate_frequency.sql
-- Try adding an index...
#ALTER TABLE tmp_individual_genotype_single_bp
# ADD INDEX allele_1_idx (allele_1),
# ADD INDEX allele_2_idx (allele_2);
# Query OK, 277545309 rows affected (43 min 29.78 sec)
-- The above index seems to have no impact on query execution time
-- below, as the individual_id_idx is picked instead of either of
-- these allele indexes!
-- Make a table with the allele_code IDs (and population_ids) instead
-- of allele strings (and individual_ids). Bizarrely adding the JOIN
-- to individual_population actually speeds up the query by a factor
-- of approximately 2.
OPTIMIZE TABLE allele_code;
OPTIMIZE TABLE tmp_individual_genotype_single_bp;
OPTIMIZE TABLE individual_genotype_multiple_bp;
DROP TABLE IF EXISTS temp_individual_genotype;
CREATE TEMPORARY TABLE temp_individual_genotype(
INDEX (variation_id, population_id),
INDEX (variation_id, population_id, allele_code_id_1),
INDEX (variation_id, population_id, allele_code_id_2),
INDEX (variation_id, population_id, allele_code_id_g)
)
#EXPLAIN
SELECT
variation_id, population_id,
one.allele_code_id AS allele_code_id_1,
two.allele_code_id AS allele_code_id_2,
CONCAT(
one.allele_code_id, ':',
two.allele_code_id) AS allele_code_id_g
FROM
tmp_individual_genotype_single_bp
INNER JOIN
individual_population p
USING
(individual_id)
INNER JOIN
allele_code one ON allele_1 = one.allele
INNER JOIN
allele_code two ON allele_2 = two.allele
#
#LIMIT
# 0
;
# Query OK, 555,090,618 rows affected (3 hours 9 min 50.87 sec)
# Records: 555,090,618 Duplicates: 0 Warnings: 0
# Query OK, 832,635,927 rows affected (3 hours 34 min 39.96 sec)
# Records: 832,635,927 Duplicates: 0 Warnings: 0
# Query OK, 832,635,927 rows affected (5 hours 46 min 49.44 sec)
# Records: 832,635,927 Duplicates: 0 Warnings: 0
OPTIMIZE TABLE temp_individual_genotype;
# 1 row in set (10 min 50.60 sec)
# 1 row in set ( 7 min 19.21 sec)
# 1 row in set ( 7 min 55.38 sec)
-- Add the multiple BP data to the same table
INSERT INTO
temp_individual_genotype
SELECT
variation_id, population_id,
one.allele_code_id AS allele_code_id_1,
two.allele_code_id AS allele_code_id_2,
CONCAT(
one.allele_code_id, ':',
two.allele_code_id) AS allele_code_id_g
FROM
individual_genotype_multiple_bp
INNER JOIN
individual_population p
USING
(individual_id)
INNER JOIN
allele_code one ON allele_1 = one.allele
INNER JOIN
allele_code two ON allele_2 = two.allele
#
#LIMIT
# 50000
;
#Query OK, 94,259,724 rows affected (29 min 33.71 sec)
#Records: 94,259,724 Duplicates: 0 Warnings: 0
OPTIMIZE TABLE temp_individual_genotype;
#1 row in set (1 min 5.82 sec)
-- Now make tables for each allele, genotype, and the total...
-- ALLELE (in three steps)
-- Allele 1
DROP TABLE IF EXISTS temp_individual_genotype_ax;
CREATE TEMPORARY TABLE temp_individual_genotype_ax
-- Adding N into the index here avoids a costly data look-up later.
(INDEX pk_ish (variation_id, population_id, allele_code_id, N))
#EXPLAIN
SELECT variation_id, population_id,
allele_code_id_1 AS allele_code_id,
COUNT(*) AS N
FROM temp_individual_genotype
GROUP BY variation_id, population_id, allele_code_id_1;
#Query OK, 327889779 rows affected (29 min 32.86 sec)
OPTIMIZE TABLE temp_individual_genotype_ax;
-- Allele 2
INSERT INTO temp_individual_genotype_ax
#EXPLAIN
SELECT variation_id, population_id,
allele_code_id_2 AS allele_code_id,
COUNT(*) AS N
FROM temp_individual_genotype
GROUP BY variation_id, population_id, allele_code_id_2;
#Query OK, 290316464 rows affected (25 min 17.58 sec)
OPTIMIZE TABLE temp_individual_genotype_ax;
-- and finally...
DROP TABLE IF EXISTS temp_individual_genotype_aa;
CREATE TEMPORARY TABLE temp_individual_genotype_aa
(PRIMARY KEY (variation_id, population_id, allele_code_id))
#EXPLAIN
SELECT
variation_id, population_id, allele_code_id,
COUNT(*) AS X, SUM(N) AS N
FROM
temp_individual_genotype_ax
GROUP BY
variation_id, population_id, allele_code_id
;
#Query OK, 399187969 rows affected (32 min 38.95 sec)
OPTIMIZE TABLE temp_individual_genotype_aa;
-- GENOTYPE
DROP TABLE IF EXISTS temp_individual_genotype_gg;
CREATE TEMPORARY TABLE temp_individual_genotype_gg
(PRIMARY KEY (variation_id, population_id, allele_code_id_g))
#EXPLAIN
SELECT
variation_id, population_id, allele_code_id_g,
COUNT(*) AS N
FROM
temp_individual_genotype
GROUP BY
variation_id, population_id, allele_code_id_g
;
#Query OK, 329562440 rows affected (1 hour 21 min 20.40 sec)
OPTIMIZE TABLE temp_individual_genotype_gg;
-- TOTAL
DROP TABLE IF EXISTS temp_individual_genotype_tt;
CREATE TEMPORARY TABLE temp_individual_genotype_tt
(PRIMARY KEY (variation_id, population_id))
#EXPLAIN
SELECT
variation_id, population_id,
COUNT(*) AS N
FROM
temp_individual_genotype
GROUP BY
variation_id, population_id
;
#Query OK, 283148189 rows affected (22 min 14.18 sec)
OPTIMIZE TABLE temp_individual_genotype_tt;
-- MAKE COUNTS LIKE THIS
-- Allele
RENAME TABLE allele TO allele_bk3;
CREATE TABLE allele LIKE allele_bk3;
INSERT INTO allele
(variation_id, allele_code_id, population_id, count, frequency)
SELECT
variation_id, allele_code_id, population_id,
a.N AS count,
-- We picked up a factor of two somewhere above
a.N / t.N / 2 AS frequency
FROM
temp_individual_genotype_aa a
INNER JOIN
temp_individual_genotype_tt t
USING
(variation_id, population_id)
#LIMIT
# 03
;
#Query OK, 399187969 rows affected, 1 warning (1 hour 42 min 43.03 sec)
OPTIMIZE TABLE allele;
-- Genotype
-- For convenience we need to select population genotype in same
-- format as allele_code_id_g above
DROP TABLE IF EXISTS temp_genotype_code;
CREATE TEMPORARY TABLE temp_genotype_code
(PRIMARY KEY (allele_code_id_g)) AS
SELECT
genotype_code_id,
CONCAT(
one.allele_code_id, ':',
two.allele_code_id) AS allele_code_id_g
FROM
genotype_code one
INNER JOIN
genotype_code two
USING
(genotype_code_id)
WHERE
one.haplotype_id = 1 AND
two.haplotype_id = 2
;
#Query OK, 1000424 rows affected (12.56 sec)
#Query OK, 1000424 rows affected (14.21 sec)
OPTIMIZE TABLE temp_genotype_code;
#1 row in set (0.27 sec)
RENAME TABLE population_genotype TO population_genotype_bk3;
CREATE TABLE population_genotype LIKE population_genotype_bk3;
INSERT INTO population_genotype
(variation_id, genotype_code_id, population_id, frequency)
SELECT
variation_id, genotype_code_id, population_id,
g.N / t.N AS frequency
FROM
temp_individual_genotype_gg g
INNER JOIN
temp_individual_genotype_tt t
USING
(variation_id, population_id)
INNER JOIN
temp_genotype_code
USING
(allele_code_id_g)
#LIMIT
# 03
;
#Query OK, 367,890,086 rows affected, 1 warning (2 hours 31 min 41.78 sec)
#Records: 367,890,086 Duplicates: 0 Warnings: 1
OPTIMIZE TABLE population_genotype;
#1 row in set (2 min 50.12 sec)
-- PRAY!
DROP TABLE IF EXISTS
temp_individual_genotype_tt,
temp_individual_genotype_gg,
temp_individual_genotype_aa,
temp_individual_genotype_ax,
temp_individual_genotype,
temp_genotype_code
;
[dbolser@ebi-001 VCFLoad_simple]$
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment