Skip to content

Instantly share code, notes, and snippets.

@allthesignals
Created February 13, 2018 20:22
Show Gist options
  • Select an option

  • Save allthesignals/bd62c661f17713a401ac146ab5c980dc to your computer and use it in GitHub Desktop.

Select an option

Save allthesignals/bd62c661f17713a401ac146ab5c980dc to your computer and use it in GitHub Desktop.
WITH
filtered_selection AS (
SELECT *
FROM demographic
WHERE geoid IN ('1000201','1000202','1000600','1000800','1001001','1001002','1001200','1001401','1001402','1001600','1001800','1002000','1002201','1002202','1002400','1002500','1002601','1002602','1002700','1002800','1003001','1003002','1003200','1003400','1003601','1003602','1003800','1004000','1004400','1004800')
),
enriched_selection AS (
SELECT *
FROM filtered_selection
INNER JOIN support_fact_finder_metadata_v3
ON support_fact_finder_metadata_v3.variablename = filtered_selection.variable
),
main_numbers AS (
SELECT
*,
-- cv --
(((m / 1.645) / NULLIF(SUM,0)) * 100) AS cv,
-- previous_sum --
CASE
WHEN is_most_recent THEN
lag(sum) over (order by variable, dataset)
END AS previous_sum,
-- previous_m --
CASE
WHEN is_most_recent THEN
lag(m) over (order by variable, dataset)
END AS previous_m
FROM (
SELECT
-- sum --
sum(e) AS sum,
-- m --
sqrt(sum(power(m, 2))) AS m,
-- is_most_recent --
CASE
WHEN max(dataset) over () = dataset THEN
TRUE
ELSE
FALSE
END AS is_most_recent,
base,
category,
variable,
profile,
dataset
FROM enriched_selection
GROUP BY variable, dataset, base, category, profile
) x
),
base_numbers AS (
SELECT
-- base_sum --
sum(e) AS base_sum,
-- base_m --
sqrt(sum(power(m, 2))) AS base_m,
-- base_join --
max(base) AS base_join,
-- base_dataset --
max(dataset) AS base_dataset,
-- previous_base_sum --
lag(sum(e)) over (order by variable, dataset) AS previous_base_sum,
-- previous_base_m --
lag(sqrt(sum(power(m, 2)))) over (order by variable, dataset) AS previous_base_m
FROM enriched_selection
WHERE base = variable
GROUP BY variable, dataset
),
comparison_selection AS (
SELECT *
FROM demographic
WHERE geoid = '0'
),
comparison_enriched_selection AS (
SELECT *
FROM comparison_selection
INNER JOIN support_fact_finder_metadata_v3
ON support_fact_finder_metadata_v3.variablename = comparison_selection.variable
),
comparison_main_numbers AS (
SELECT
*
FROM (
SELECT
-- comparison_sum --
e AS comparison_sum,
-- comparison_m --
m AS comparison_m,
-- percent --
p AS comparison_percent,
-- comparison_percent_m --
z AS comparison_percent_m,
-- comparison_cv --
c AS comparison_cv,
-- comparison_join --
base AS comparison_join,
-- comparison_variable --
variable AS comparison_variable,
-- comparison_dataset --
dataset AS comparison_dataset
FROM comparison_enriched_selection
) x
)
SELECT
*,
-- significant --
CASE
WHEN ABS(SQRT(POWER(m / 1.645, 2) + POWER(comparison_m / 1.645, 2)) * 1.645) > ABS(comparison_sum - sum) THEN false
ELSE true
END AS significant,
-- percent_significant --
CASE
WHEN ABS(SQRT(POWER(percent_m / 1.645, 2) + POWER(comparison_percent_m / 1.645, 2)) * 1.645) > ABS(comparison_percent - percent) THEN false
ELSE true
END AS percent_significant,
-- difference_sum --
(sum - comparison_sum) AS difference_sum,
-- difference_percent --
CASE
WHEN (((percent - comparison_percent) * 100) < 0 AND ((percent - comparison_percent) * 100) > -0.05) THEN
0
ELSE
(percent - comparison_percent) * 100
END AS difference_percent,
-- difference_m --
(SQRT((POWER(m, 2) + POWER(comparison_m, 2)))) AS difference_m,
-- difference_percent_m --
(SQRT((POWER(percent_m * 100, 2) + POWER(comparison_percent_m * 100, 2)))) AS difference_percent_m,
-- change_percentage_point --
CASE
WHEN is_most_recent THEN
percent - previous_percent
END AS change_percentage_point,
-- change_percentage_point_m --
CASE
WHEN is_most_recent THEN
(SQRT((POWER(previous_percent_m, 2) + POWER(percent_m, 2))))
END AS change_percentage_point_m,
-- change_significant --
CASE
WHEN (change_m < ABS(change_sum)) THEN
TRUE
ELSE
FALSE
END AS change_significant,
-- change_percent_significant --
CASE
WHEN (change_percent_m < ABS(change_percent)) THEN
TRUE
ELSE
FALSE
END AS change_percent_significant,
-- change_percentage_point_significant --
CASE
WHEN (ABS((SQRT((POWER(previous_percent_m, 2) + POWER(percent_m, 2))))) < (percent - previous_percent)) THEN
TRUE
ELSE
FALSE
END AS change_percentage_point_significant
FROM (
SELECT
-- id --
ENCODE(CONVERT_TO(variable || dataset, 'UTF-8'), 'base64') AS id,
base,
-- variablename --
variable AS variablename,
category,
-- dataset --
regexp_replace(lower(dataset), '[^A-Za-z0-9]', '_', 'g') AS dataset,
-- profile --
regexp_replace(lower(profile), '[^A-Za-z0-9]', '_', 'g') AS profile,
-- variable --
regexp_replace(lower(variable), '[^A-Za-z0-9]', '_', 'g') AS variable,
is_most_recent,
sum,
m,
cv,
-- percent --
ROUND((SUM / NULLIF(base_sum,0))::numeric, 4) AS percent,
-- previous_percent --
ROUND((previous_sum / NULLIF(previous_base_sum,0))::numeric, 4) AS previous_percent,
previous_sum,
previous_m,
-- percent_m --
CASE
WHEN (POWER(m, 2) - POWER(sum / NULLIF(base_sum,0), 2) * POWER(base_m, 2)) < 0
THEN (1 / NULLIF(base_sum,0)) * SQRT(POWER(m, 2) + POWER(sum / NULLIF(base_sum,0), 2) * POWER(base_m, 2))
ELSE (1 / NULLIF(base_sum,0)) * SQRT(POWER(m, 2) - POWER(sum / NULLIF(base_sum,0), 2) * POWER(base_m, 2))
END AS percent_m,
-- previous_percent_m --
CASE
WHEN (POWER(previous_m, 2) - POWER(previous_sum / NULLIF(previous_base_sum,0), 2) * POWER(previous_base_m, 2)) < 0
THEN (1 / NULLIF(previous_base_sum,0)) * SQRT(POWER(previous_m, 2) + POWER(previous_sum / NULLIF(previous_base_sum,0), 2) * POWER(previous_base_m, 2))
ELSE (1 / NULLIF(previous_base_sum,0)) * SQRT(POWER(previous_m, 2) - POWER(previous_sum / NULLIF(previous_base_sum,0), 2) * POWER(previous_base_m, 2))
END AS previous_percent_m,
-- is_reliable --
CASE
WHEN (cv < 20)
THEN true
ELSE false
END AS is_reliable,
comparison_cv,
comparison_m,
comparison_sum,
comparison_percent_m,
comparison_percent,
-- comparison_is_reliable --
CASE
WHEN (comparison_cv < 20)
THEN true
ELSE false
END AS comparison_is_reliable,
-- change_sum --
CASE
WHEN is_most_recent THEN
sum - previous_sum
END AS change_sum,
-- change_m --
CASE
WHEN is_most_recent THEN
ABS(SQRT(POWER(m, 2) + POWER(previous_m, 2)))
END AS change_m,
-- change_percent --
CASE
WHEN is_most_recent THEN
ROUND(((sum - previous_sum) / NULLIF(previous_sum,0))::numeric, 4)
END AS change_percent,
-- change_percent_m --
CASE
WHEN is_most_recent THEN
ABS(sum / NULLIF(previous_sum,0))
* SQRT(
(POWER(m / 1.645, 2) / POWER(sum, 2))
+ (POWER(previous_m / 1.645, 2) / POWER(previous_sum, 2))
) * 1.645
END AS change_percent_m
FROM main_numbers
INNER JOIN comparison_main_numbers
ON main_numbers.variable = comparison_main_numbers.comparison_variable
AND main_numbers.dataset = comparison_main_numbers.comparison_dataset
LEFT OUTER JOIN base_numbers
ON main_numbers.base = base_numbers.base_join
AND main_numbers.dataset = base_numbers.base_dataset
) precalculations
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment