Skip to content

Instantly share code, notes, and snippets.

@nuria
Last active October 3, 2018 23:53
Show Gist options
  • Select an option

  • Save nuria/c0d5320655ba8c45136a16a2f9dea7ea to your computer and use it in GitHub Desktop.

Select an option

Save nuria/c0d5320655ba8c45136a16a2f9dea7ea to your computer and use it in GitHub Desktop.
-- make session dataset samller to be able to try things fast
--create table session_tryouts as select * from classifier_data_sorted a where a.sessionId in (select distinct s.sessionId from classifier_data_sorted s limit 100);
drop table if exists classifier_data_label;
create table
classifier_data_label
as
select
sessionId,
(unix_timestamp(max(ts)) - unix_timestamp( min(ts))) as length,
count(*) number_of_requests,
(unix_timestamp(max(ts)) - unix_timestamp( min(ts)))/count(*) as request_ratio,
agent_type,
case
when count(*) > 10 and (unix_timestamp(max(ts)) - unix_timestamp( min(ts)))/count(*) <= 20 then 'bot'
when count(*) < 2 and agent_type!='spider' then 'user'
when length(user_agent) > 400 then 'bot'
end as label
from
nuria.session_tryouts
group by
sessionId, agent_type, user_agent;
-- now calculate the volume of requests that this represents
-- >>> 21101/365207.0 *100 -> 5.77%
select sum(number_of_requests) from classifier_data_label where label="bot" and agent_type="user";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment