Last active
August 29, 2015 14:10
-
-
Save miyagawa/3c1ac36f8856b6d790ed to your computer and use it in GitHub Desktop.
Analyze podcast feed clients
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
use strict;
use warnings;
use JSON;
use List::Util qw(sum);
# Only log entries whose request path matches this pattern are counted.
my $matching_path = qr!^/rebuildfm!;

# User-Agent classification rules as [pattern => label] pairs.
# parse_ua() walks the list top to bottom and stops at the first pattern
# that matches, so the order of the rows is significant.
my @UA = (
    [ qr/^Podcasts?\/\d/                    => "Apple Podcasts"   ],
    [ qr/^iTunes\/[\d\.]+ \(Macintosh/      => "iTunes (OS X)"    ],
    [ qr/^iTunes\/[\d\.]+ \(Windows;/       => "iTunes (Windows)" ],
    [ qr/^iTunes\/[\d\.]+ Downcast\//       => "Downcast"         ],
    [ qr/^RSS_Radio\/\d+/                   => "RSS Radio"        ],
    [ qr/^livedoor FeedFetcher/             => "Livedoor Reader"  ],
    [ qr/ theoldreader\.com;/               => "The Old Reader"   ],
    [ qr/^NewsBlur Feed Fetcher /           => "NewsBlur"         ],
    [ qr/^Digg Feed Fetcher/                => "Digg"             ],
    [ qr/^Feedfetcher-Google; /             => "Google Reader"    ],
    [ qr/ BeyondPod\)$/                     => "BeyondPod"        ],
    [ qr/ DoggCatcher$/                     => "DoggCatcher"      ],
    [ qr/ Feedeen /                         => "Feedeen"          ],
    [ qr/ inoreader\.com-like FeedFetcher/  => "Inoreader"        ],
    [ qr/ podcast\.de\/\d/                  => "Podcast.de"       ],
    [ qr/ BazQux\/[\d\.]+;/                 => "BazQux"           ],
    [ qr/ ShiroyagiRssCrawler/              => "Shiroyagi"        ],
);
# Classify a raw User-Agent string.
#
# Argument:
#   $str - the User-Agent field as captured from the log line.
#
# Returns the list ($agent, $subs, $feed_id):
#   $agent   - label of the first matching @UA rule, otherwise a name derived
#              from the string itself, otherwise "Unknown";
#   $subs    - the number preceding " subscriber" in the string, or undef;
#   $feed_id - the value of a "feed-id=" / "feed_id=" / "feedid=" token,
#              or undef.
# For an unusable User-Agent only the single element "Unknown" is returned.
sub parse_ua {
    my $str = shift;

    # Missing or empty field, the "-" placeholder, or a bare number carry no
    # client information.
    return "Unknown"
        if !defined $str || $str eq "" || $str eq "-" || $str =~ /^\d+$/;

    # First matching rule wins.
    my $agent;
    for my $rule (@UA) {
        my ($re, $label) = @$rule;
        if ($str =~ $re) {
            $agent = $label;
            last;
        }
    }

    # Generic hosted feed crawler: these fields are extracted regardless of
    # whether a known rule matched. A failed match leaves the variable undef.
    my ($subs)    = $str =~ /\b(\d+) subscriber/;
    my ($feed_id) = $str =~ /feed[-_]?id=(\w+)/;

    unless ($agent) {
        # No rule matched: reduce the raw string to a bare product name.
        $agent = $str;
        $agent =~ s/ *https?:\/\/.*$//;    # drop a URL and everything after it
        $agent =~ s/ *\(.*$//;             # drop a parenthesised comment
        $agent =~ s/\/.*$//;               # drop "/version" and what follows
        $agent =~ s/ [\d\.]+$//;           # drop a trailing " 1.2.3"
        $agent =~ s/ - .*$//;              # drop a " - description" suffix

        # The stripping above can consume the whole string (e.g. a UA that is
        # just a URL); never hand back an empty label.
        $agent = "Unknown" unless length $agent;
    }

    return $agent, $subs, $feed_id;
}
# Per-agent tallies collected from the log:
#   $hits{$agent}{_direct}{$ip} = 1     - one entry per distinct client IP
#   $hits{$agent}{$feed_id}     = $subs - highest subscriber count reported
#                                         by a hosted crawler for that feed
my %hits;

while (my $line = <>) {
    # Expected shape: a leading dotted-numeric address, a quoted request
    # ("METHOD path ..."), and the User-Agent as the second-to-last quoted
    # field on the line. Lines that do not fit are skipped.
    my ($ip, $path, $agent_string) =
        $line =~ /^([0-9\.]+) .*?"[A-Z]+ ([^ ]+) .*"(.*?)" ".*?"$/;
    next unless $ip && $path =~ $matching_path;

    my ($agent, $subs, $id) = parse_ua($agent_string);
    if ($subs) {
        # Crawler reporting on behalf of several subscribers: keep the
        # maximum seen per feed, keyed by path when no feed id is given.
        $id ||= $path;
        $hits{$agent}{$id} = $subs
            if $subs > ($hits{$agent}{$id} || 0);
    } else {
        # Direct client: each distinct IP counts once.
        $hits{$agent}{_direct}{$ip} = 1;
    }
}

# Collapse the tallies into one subscriber figure per agent and a grand total.
my $total = 0;
my %agents;
for my $agent (keys %hits) {
    my %feeds   = %{ $hits{$agent} };       # shallow copy; %hits stays intact
    my $directs = delete $feeds{_direct};

    # The leading 0 keeps sum() defined when there is nothing to add.
    my $agent_subs = sum(
        0,
        ($directs ? scalar(keys %$directs) : ()),
        values %feeds,
    );

    $agents{$agent} = $agent_subs;
    $total += $agent_subs;
}

# Emit agents ordered by subscriber count, largest first.
my @agents = map { +{ agent => $_, subscribers => $agents{$_} } }
    sort { $agents{$b} <=> $agents{$a} } keys %agents;

print JSON::encode_json({ total => $total, agents => \@agents });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment