Skip to content

Instantly share code, notes, and snippets.

@milimetric
Created October 1, 2015 15:08
Show Gist options
  • Save milimetric/d18d60d48240107768c3 to your computer and use it in GitHub Desktop.
Save milimetric/d18d60d48240107768c3 to your computer and use it in GitHub Desktop.
If you want to look at old sampled logs with the new refinery approach (UDFs, etc.) use this
ADD JAR /srv/deployment/analytics/refinery/artifacts/refinery-hive.jar;
CREATE TEMPORARY FUNCTION client_ip as 'org.wikimedia.analytics.refinery.hive.ClientIpUDF';
CREATE TEMPORARY FUNCTION geocoded_data as 'org.wikimedia.analytics.refinery.hive.GeocodedDataUDF';
CREATE TEMPORARY FUNCTION is_pageview as 'org.wikimedia.analytics.refinery.hive.IsPageviewUDF';
CREATE TEMPORARY FUNCTION get_access_method as 'org.wikimedia.analytics.refinery.hive.GetAccessMethodUDF';
CREATE TEMPORARY FUNCTION classify_referer AS 'org.wikimedia.analytics.refinery.hive.RefererClassifierUDF';
CREATE TEMPORARY FUNCTION is_wikimedia_bot as 'org.wikimedia.analytics.refinery.hive.IsWikimediaBotUDF';
CREATE TEMPORARY FUNCTION ua_parser as 'org.wikimedia.analytics.refinery.hive.UAParserUDF';
CREATE TEMPORARY FUNCTION is_spider as 'org.wikimedia.analytics.refinery.hive.IsSpiderUDF';
SELECT geocoded_data(client_ip(ip, x_forwarded_for))['continent'] as continent,
substr(dt, 0, 10) as ymd,
get_access_method(uri_host, user_agent) as access_method,
classify_referer(referer) as referrer_class,
count(*) as view_count
FROM milimetric.webrequest_sampled
is_pageview(uri_host, uri_path, uri_query, http_status, content_type, user_agent, x_analytics) as is_pageview,
WHERE is_pageview(
<<uri_host>>, <<uri_path>>, <<uri_query>>,
status, content_type, user_agent, null
) = 1
and substr(dt, 0, 10) between '2014-08-01' and '2014-08-02'
and CASE
WHEN ((is_wikimedia_bot(user_agent))) THEN 'bot'
WHEN ((ua_parser(user_agent)['device_family'] = 'Spider') OR (is_spider(user_agent))) THEN 'spider'
ELSE 'user'
END = 'user'
and get_access_method(uri_host, user_agent) in ('desktop', 'mobile')
GROUP BY
geocoded_data(client_ip(ip, x_forwarded_for))['continent'],
substr(dt, 0, 10),
classify_referer(referer),
get_access_method(uri_host, user_agent)
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment