Created
October 1, 2015 15:08
-
-
Save milimetric/d18d60d48240107768c3 to your computer and use it in GitHub Desktop.
If you want to look at old sampled logs with the new refinery approach (UDFs, etc.) use this
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ADD JAR /srv/deployment/analytics/refinery/artifacts/refinery-hive.jar; | |
CREATE TEMPORARY FUNCTION client_ip as 'org.wikimedia.analytics.refinery.hive.ClientIpUDF'; | |
CREATE TEMPORARY FUNCTION geocoded_data as 'org.wikimedia.analytics.refinery.hive.GeocodedDataUDF'; | |
CREATE TEMPORARY FUNCTION is_pageview as 'org.wikimedia.analytics.refinery.hive.IsPageviewUDF'; | |
CREATE TEMPORARY FUNCTION get_access_method as 'org.wikimedia.analytics.refinery.hive.GetAccessMethodUDF'; | |
CREATE TEMPORARY FUNCTION classify_referer AS 'org.wikimedia.analytics.refinery.hive.RefererClassifierUDF'; | |
CREATE TEMPORARY FUNCTION is_wikimedia_bot as 'org.wikimedia.analytics.refinery.hive.IsWikimediaBotUDF'; | |
CREATE TEMPORARY FUNCTION ua_parser as 'org.wikimedia.analytics.refinery.hive.UAParserUDF'; | |
CREATE TEMPORARY FUNCTION is_spider as 'org.wikimedia.analytics.refinery.hive.IsSpiderUDF'; | |
SELECT geocoded_data(client_ip(ip, x_forwarded_for))['continent'] as continent, | |
substr(dt, 0, 10) as ymd, | |
get_access_method(uri_host, user_agent) as access_method, | |
classify_referer(referer) as referrer_class, | |
count(*) as view_count | |
FROM milimetric.webrequest_sampled | |
is_pageview(uri_host, uri_path, uri_query, http_status, content_type, user_agent, x_analytics) as is_pageview, | |
WHERE is_pageview( | |
<<uri_host>>, <<uri_path>>, <<uri_query>>, | |
status, content_type, user_agent, null | |
) = 1 | |
and substr(dt, 0, 10) between '2014-08-01' and '2014-08-02' | |
and CASE | |
WHEN ((is_wikimedia_bot(user_agent))) THEN 'bot' | |
WHEN ((ua_parser(user_agent)['device_family'] = 'Spider') OR (is_spider(user_agent))) THEN 'spider' | |
ELSE 'user' | |
END = 'user' | |
and get_access_method(uri_host, user_agent) in ('desktop', 'mobile') | |
GROUP BY | |
geocoded_data(client_ip(ip, x_forwarded_for))['continent'], | |
substr(dt, 0, 10), | |
classify_referer(referer), | |
get_access_method(uri_host, user_agent) | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment