Skip to content

Instantly share code, notes, and snippets.

@itochu0523
Last active November 10, 2015 05:32
Show Gist options
  • Save itochu0523/8d38b1fc139f9a39de81 to your computer and use it in GitHub Desktop.
Save itochu0523/8d38b1fc139f9a39de81 to your computer and use it in GitHub Desktop.
簡単なセッションテーブルからパス解析のテーブルを作成する手順 ref: http://qiita.com/itochu0523/items/a918a899152b5adacdd3
SELECT
TD_TIME_FORMAT(TD_TIME_PARSE(session_start_time),'yyyy-MM-dd') as date
,count(distinct session_id) as session_cnt
FROM
session_summary
GROUP BY
TD_TIME_FORMAT(TD_TIME_PARSE(session_start_time),'yyyy-MM-dd')
ORDER BY
TD_TIME_FORMAT(TD_TIME_PARSE(session_start_time),'yyyy-MM-dd')
-- hive or presto
SELECT
a.td_url as td_url
,ROW_NUMBER() OVER ( ORDER BY a.cnt DESC ) AS rownum
FROM
(
SELECT
regexp_replace(td_url, '(http://|https://)', '') as td_url
,count(1) as cnt
FROM access
-- 2ヶ月間のマスタを作成
WHERE TD_TIME_RANGE(time, '2015-04-01', '2015-05-31', 'JST')
GROUP BY regexp_replace(td_url, '(http://|https://)', '')
) a
SELECT
a.td_url as td_url
,ROW_NUMBER() OVER ( ORDER BY a.cnt DESC ) AS rownum
FROM
(
SELECT
regexp_replace(td_url, '(http://|https://)', '') as td_url
,count(1) as cnt
FROM access
WHERE TD_TIME_RANGE(time, '2015-06-01', TD_TIME_ADD('2015-06-01','1d'), 'JST')
GROUP BY regexp_replace(td_url, '(http://|https://)', '')
) a
-- hive or presto
SELECT
c.td_url as td_url
,(d.max_rownum + c.rownum) as rownum
FROM
(
select
a.td_url as td_url
,ROW_NUMBER() OVER ( ORDER BY b.rownum DESC ) AS rownum
from
tmp_page_master a
LEFT OUTER JOIN
page_master b
ON
a.td_url = b.td_url
WHERE b.rownum is null
) c
,
( select max(rownum) as max_rownum from page_master ) d
SELECT
td_url
, rownum as page_id
,(CASE
-- 以下の行に足していく...
WHEN td_url like '%thank%' THEN 1
WHEN td_url LIKE '%THANK%' THEN 1
WHEN td_url LIKE '%Thank%' THEN 1
ELSE 0
END) as conversion_flag
FROM page_master
SELECT
b.session_id as session_id
, b.time as time
, b.td_ip as td_ip
, b.td_path as td_path
, b.td_client_id as td_client_id
, b.td_title as td_title
, b.mobile_flag as mobile_flag
, b.td_browser as td_browser
, b.td_color as td_color
, b.td_os_version as td_os_version
, b.td_browser_version as td_browser_version
, b.td_referrer as td_referrer
, b.search_word as search_word
, b.td_screen as td_screen
, b.td_os as td_os
, b.td_host as td_host
, b.td_url as td_url
, (CASE WHEN c.conversion_flag = 1 THEN 1 ELSE 0 END) as conversion_flag
, c.page_id as page_id
, b.td_language as td_language
FROM
(
SELECT
TD_SESSIONIZE(time, 86400, td_ip) as session_id
, time
, td_ip
, td_path
, td_client_id
, td_title
, td_browser
, td_color
, td_os_version
, td_browser_version
, td_referrer
, td_screen
, td_os
, td_host
, td_url
, regexp_replace(td_url, '(http://|https://)', '') as url
, td_language
, (CASE
WHEN td_referrer like '%google%' THEN parse_url(TD_URL_DECODE(td_referrer),'QUERY','q')
WHEN td_referrer like '%bing.com%' THEN parse_url(TD_URL_DECODE(td_referrer),'QUERY','q')
WHEN td_referrer like '%yahoo%' THEN parse_url(TD_URL_DECODE(td_referrer),'QUERY','p')
ELSE parse_url(TD_URL_DECODE(td_referrer),'QUERY','p')
END) as search_word
, (CASE WHEN td_os IN ('iOS','BlackBerry OS','Android','Windows Phone') THEN 1 ELSE 0 END) as mobile_flag
FROM
(
SELECT time, td_ip, td_path, td_client_id, td_title
,td_browser, td_color, td_os_version
,td_browser_version, td_referrer, td_screen
,td_os, td_host, td_url, td_language
FROM access
distribute by td_ip
sort by td_ip,time
) a
) b
,page_master_detail c
WHERE b.url = c.td_url
SELECT
session_id
, td_client_id
, collect_list(CAST(page_id as STRING)) as path
, count(1) as page_cnt
FROM session_detail
GROUP BY
session_id, td_client_id
SELECT
session_id,
pid
FROM
session_path
LATERAL VIEW
explode(PATH) pageTable AS pid
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment