Skip to content

Instantly share code, notes, and snippets.

@doryokujin
Last active December 7, 2018 05:36
Show Gist options
  • Save doryokujin/ec04127763152c050ad66eaef37e74f2 to your computer and use it in GitHub Desktop.
Save doryokujin/ec04127763152c050ad66eaef37e74f2 to your computer and use it in GitHub Desktop.
『Treasure Data でアクセスログ分析の限界に挑む』その② 〜アクセスに基づいたユーザーセグメントの作成 月次編(頻度)〜 ref: https://qiita.com/doryokujin/items/d4899269a0c0aa7db6ea
TD_DATE_TRUNC('day',time,'JST') AS access_day
/* TD_SCHEDULED_TIME() = '2018-11-23 11:11:00' */
/*
SELECT target_month, freq/*_per_week*/ ,COUNT(1) AS cnt, AVG(pv_daily_avg) AS pv_daily_avg, STDDEV(pv_daily_avg) pv_stdev
FROM
(
*/
SELECT
TD_TIME_FORMAT(TD_DATE_TRUNC('month', TD_DATE_TRUNC('month', TD_SCHEDULED_TIME(),'JST')-1, 'JST'),'yyyy-MM-dd','JST') AS target_month,
IF(past_month.td_client_id IS NOT NULL, past_month.td_client_id, past_month_ago.td_client_id ) AS td_client_id,
IF(freq IS NOT NULL, freq, 0) AS freq,
IF(freq IS NOT NULL, freq/4,-1) AS freq_per_week,
IF(pv IS NOT NULL, pv, 0) AS pv,
IF(pv_daily_avg IS NOT NULL,pv_daily_avg, 0) AS pv_daily_avg,
IF(pv_stdev IS NOT NULL, pv_stdev, 0) AS pv_stdev,
min_time, max_time
FROM
(
SELECT
td_client_id,
COUNT(1) AS freq, SUM(pv) AS pv, AVG(pv) AS pv_daily_avg, STDDEV(pv) AS pv_stdev,
TD_TIME_FORMAT(MIN(min_time),'yyyy-MM-dd','JST') AS min_time, TD_TIME_FORMAT(MAX(max_time),'yyyy-MM-dd','JST') AS max_time
FROM
(
SELECT td_client_id, TD_DATE_TRUNC('day',time,'JST') AS access_day, COUNT(1) AS pv, MIN(time) AS min_time, MAX(time) AS max_time
FROM pageviews
WHERE TD_INTERVAL(time, '-1M', 'JST')
GROUP BY td_client_id, TD_DATE_TRUNC('day',time,'JST')
)
GROUP BY td_client_id
) past_month
FULL OUTER JOIN
(
SELECT td_client_id
FROM pageviews
WHERE TD_INTERVAL(time, '-10y/-1M', 'JST') /* 1ヶ月前より過去 */
GROUP BY td_client_id
) past_month_ago
ON past_month.td_client_id = past_month_ago.td_client_id
/*
)
GROUP BY target_month, freq/*_per_week*/
ORDER BY freq/*_per_week*/ DESC
*/
/* TD_SCHEDULED_TIME() = '2018-11-23 11:11:00' */
/*
SELECT target_month, frequency_past_1, frequency_past_2, frequency_past_1-IF(frequency_past_2 IS NOT NULL,frequency_past_2,0) AS diff, COUNT(1) AS cnt
FROM
(
*/
SELECT
TD_TIME_FORMAT(TD_DATE_TRUNC('month', TD_DATE_TRUNC('month', TD_SCHEDULED_TIME(),'JST')-1, 'JST'),'yyyy-MM-dd','JST') AS target_month,
past_1month.td_client_id,
past_1month.freq_per_week AS frequency_past_1,
past_2month.freq_per_week AS frequency_past_2,
past_1month.min_time AS min_time_past_1, past_1month.max_time AS max_time_past_1,
past_2month.min_time AS min_time_past_2, past_2month.max_time AS max_time_past_2
FROM
(
SELECT
IF(past_month.td_client_id IS NOT NULL, past_month.td_client_id, past_month_ago.td_client_id ) AS td_client_id,
IF(freq IS NOT NULL, freq, 0) AS freq,
IF(freq IS NOT NULL, freq/4,-1) AS freq_per_week,
IF(pv IS NOT NULL, pv, 0) AS pv,
IF(pv_daily_avg IS NOT NULL,pv_daily_avg, 0) AS pv_daily_avg,
IF(pv_stdev IS NOT NULL, pv_stdev, 0) AS pv_stdev,
min_time, max_time
FROM
(
SELECT
td_client_id,
COUNT(1) AS freq, SUM(pv) AS pv, AVG(pv) AS pv_daily_avg, STDDEV(pv) AS pv_stdev,
TD_TIME_FORMAT(MIN(min_time),'yyyy-MM-dd','JST') AS min_time, TD_TIME_FORMAT(MAX(max_time),'yyyy-MM-dd','JST') AS max_time
FROM
(
SELECT td_client_id, TD_DATE_TRUNC('day',time,'JST') AS access_day, COUNT(1) AS pv, MIN(time) AS min_time, MAX(time) AS max_time
FROM pageviews
WHERE TD_INTERVAL(time, '-1M', 'JST')
GROUP BY td_client_id, TD_DATE_TRUNC('day',time,'JST')
)
GROUP BY td_client_id
) past_month
FULL OUTER JOIN
(
SELECT td_client_id
FROM pageviews
WHERE TD_INTERVAL(time, '-10y/-1M', 'JST') /* 1ヶ月前より過去 */
GROUP BY td_client_id
) past_month_ago
ON past_month.td_client_id = past_month_ago.td_client_id
) past_1month
LEFT OUTER JOIN /* 集合 past_1month は past_2month を内包する */
(
SELECT
IF(past_month.td_client_id IS NOT NULL, past_month.td_client_id, past_month_ago.td_client_id ) AS td_client_id,
IF(freq IS NOT NULL, freq, 0) AS freq,
IF(freq IS NOT NULL, freq/4,-1) AS freq_per_week,
IF(pv IS NOT NULL, pv, 0) AS pv,
IF(pv_daily_avg IS NOT NULL,pv_daily_avg, 0) AS pv_daily_avg,
IF(pv_stdev IS NOT NULL, pv_stdev, 0) AS pv_stdev,
min_time, max_time
FROM
(
SELECT
td_client_id,
COUNT(1) AS freq, SUM(pv) AS pv, AVG(pv) AS pv_daily_avg, STDDEV(pv) AS pv_stdev,
TD_TIME_FORMAT(MIN(min_time),'yyyy-MM-dd','JST') AS min_time, TD_TIME_FORMAT(MAX(max_time),'yyyy-MM-dd','JST') AS max_time
FROM
(
SELECT td_client_id, TD_DATE_TRUNC('day',time,'JST') AS access_day, COUNT(1) AS pv, MIN(time) AS min_time, MAX(time) AS max_time
FROM pageviews
WHERE TD_INTERVAL(time, '-1M/-1M', 'JST')
GROUP BY td_client_id, TD_DATE_TRUNC('day',time,'JST')
)
GROUP BY td_client_id
) past_month
FULL OUTER JOIN
(
SELECT td_client_id
FROM pageviews
WHERE TD_INTERVAL(time, '-10y/-2M', 'JST') /* 1ヶ月前より過去 */
GROUP BY td_client_id
) past_month_ago
ON past_month.td_client_id = past_month_ago.td_client_id
) past_2month
ON past_1month.td_client_id = past_2month.td_client_id
/*
)
GROUP BY target_month, frequency_past_1, frequency_past_2
*/
ORDER BY frequency_past_1 DESC, frequency_past_2 DESC
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment