Created
February 9, 2012 07:19
-
-
Save markcode/1778074 to your computer and use it in GitHub Desktop.
Jot - Perl pixel server stored to Scribe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package Analumic::Jotr; | |
# * | |
# * Analumic - Analytics Statistics Logger "Jot". | |
# * Perl (mod_perl module) pixel server stored to Scribe and enhanced by GeoIP. | |
# * See web server config info (web_server_config.conf). | |
# * http://analumic.com | |
# * | |
use strict; | |
#these modules are now loaded in apache config | |
# use Apache2::RequestRec (); | |
# use Apache2::RequestIO (); | |
use Apache2::Const -compile => qw(OK); | |
# use Apache2::RequestUtil; #YAAH!!! this passes all the cgi environment headers | |
# use APR::Table; | |
use POSIX qw(strftime); | |
use URI::Escape; | |
use Digest::MD5 qw(md5_hex); | |
use Data::Validate::URI qw(is_uri); | |
#these modules are now loaded in apache config | |
# use Scribe::Thrift::scribe; | |
# use Thrift::Socket; | |
# use Thrift::FramedTransport; | |
# use Thrift::BinaryProtocol; | |
# TO DO: what about tainted inputs like dangerous ua or referals etc, need a way to clean them!??? | |
#use Data::Dumper; | |
# --------------------------------------------------- SUBS | |
my $r; $r = ''; | |
sub handler { | |
$r = shift; | |
#$r->content_type('text/plain'); | |
#my $rh = Apache2::RequestUtil->request(); | |
#$rh->content_type('text/plain'); | |
#print Dumper(\%ENV); | |
run_jotr(); | |
} | |
sub display_pic { | |
$r->content_type('image/gif'); | |
$r->headers_out->set("Expires" => "Mon, 26 Jul 2005 05:00:00 GMT"); | |
$r->headers_out->set("Cache-Control" => "no-store, no-cache, must-revalidate"); | |
$r->headers_out->set("Pragma" => "no-cache"); | |
if ( $ENV{'DNT'} eq '1' ) { | |
$r->headers_out->set("DNT" => "1"); | |
} | |
$r->headers_out->set("Content-Length" => "43"); | |
printf "GIF89a\1\0\1\0%c\0\0%c%c%c\0\0\0%s,\0\0\0\0\1\0\1\0\0%c%c%c\1\0;", | |
144,255,0,0,1?pack("C8",33,249,4,5,16,0,0,0):"",2,2,4; | |
+0; | |
return Apache2::Const::OK; | |
} | |
sub trim { | |
my $string; | |
$string = $_[0]; | |
$string =~ s/^\s+//; | |
$string =~ s/\s+$//; | |
return $string; | |
} | |
sub quote_escape { | |
my $string; | |
$string = $_[0]; | |
$string =~ s/"/\\"/g; #replace " with \" cause " is the log format delimiator | |
return $string; | |
} | |
sub truncate_ip { | |
my $ip; | |
$ip = $_[0]; | |
if ( index($ip, '.') != -1 ) { | |
#ipv4 or mapped to ipv6 | |
return substr($ip, 0, rindex($ip, '.')) . '.0'; | |
} else { | |
#ipv6 | |
return substr($ip, 0, rindex($ip, ':')) . ':0'; | |
} | |
} | |
sub request_IP { | |
if ( $ENV{'GEOIP_ADDR'} ne '' ) { return $ENV{'GEOIP_ADDR'}; } | |
if ( $ENV{'HTTP_X_FORWARDED_FOR'} ne '' ) { return $ENV{'HTTP_X_FORWARDED_FOR'}; } | |
if ( $ENV{'REMOTE_ADDR'} ne '' ) { return $ENV{'REMOTE_ADDR'}; } | |
if ( $ENV{'HTTP_CLIENT_IP'} ne '' ) { return $ENV{'HTTP_CLIENT_IP'}; } | |
display_pic(); #else did fail | |
return Apache2::Const::OK; | |
} | |
sub clean_url { | |
my $url; | |
my $url = $_[0]; | |
my $https = 0; | |
if ( index($url, '/', 9) == -1 ) { $url = $url . '/'; } #make sure domain has trailing / | |
if (substr($url, 0, 8) eq 'https://') { | |
$https = 1; | |
} | |
#make sure domains http:// is not listed within the string so mysql can read first domain in string. | |
$url =~ s/http\:\/\///gi; | |
$url =~ s/https\:\/\///gi; | |
$url = substr($url, 0, 255); #max 255 chars | |
if ( $https == 1 ) { | |
return 'https://'.$url; | |
} else { | |
return 'http://'.$url; | |
} | |
} | |
sub check_url { | |
my $url; | |
my $url = $_[0]; | |
if(is_uri($url)){ | |
return 1; | |
} else { | |
return 0; | |
} | |
} | |
sub md5_to_int { | |
my $md5; | |
$md5 = $_[0]; | |
my $md5_int = ''; | |
my $i = 0; | |
my $md5_val; $md5_val = ''; | |
my @characters = ("a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","!"); | |
for ($i=0; $i<=32; $i++) { | |
$md5_val = substr($md5, $i, 1); | |
if ( length($md5_int) == 18 ) { | |
#cant have 0 as first cause mysql will trim length to 17 chars. | |
if ( substr($md5_int, 0, 1) eq '0' ) { | |
$md5_int = '9'.substr($md5_int, 1, length($md5_int)); #replace 0 with 9 | |
} | |
return $md5_int; | |
} | |
if ($md5_val =~ /^\d/) { # is a number | |
$md5_int .= $md5_val; | |
next; | |
} | |
my ($k, $r, $i); | |
for ($k = 0; $k < @characters; $k++) { | |
if ( $md5_val eq $characters[$k] ) { | |
$r = $k + 1; | |
$md5_int .= $r; | |
last; | |
} | |
} | |
} | |
if ( length($md5_int) < 18 ) { | |
my $len; $len = 0; | |
$len = 18 - length($md5_int); | |
my $roundn = ''; my $n; | |
for ($n = 0; $n < $len; $n++) { | |
$roundn .= '0'; | |
} | |
$md5_int = $md5_int . $roundn; | |
} | |
@characters = (); | |
return substr($md5_int,0,9); #32 bit | |
#return substr($md5_int,0,18); #64 bit | |
} | |
sub error_log { | |
my $bulog; my $aid; | |
$aid = $_[0]; | |
$bulog = $_[1]; | |
use Apache2::Log (); | |
Apache2::ServerRec::warn("[jot] [${aid}] ${bulog}"); | |
#writes to /var/log/apache2/error.log format (search for "[jot]") EG: | |
#[Mon Apr 04 08:31:20 2011] [warn] [jot] 2011-04-04 07:31:20 229254151336211252 JP 693a36b5b8edb232cb38edd5fa2ef844 "219.118.26.0" "Koshigaya" "Saitama" "AS" "35.883301 139.783295" "en" "http://blog.barcodestore.nl/" "http://www.moretechtips.net/2009/09/realtime-related-tweets-bar-another.html" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7" | |
display_pic(); | |
return Apache2::Const::OK; | |
} | |
sub run_jotr() { | |
# Do Not Track | |
if ( $ENV{'DNT'} eq '1' ) { | |
display_pic(); return Apache2::Const::OK; | |
} | |
#referal | |
my $referal_url = ''; #reset | |
$referal_url = $ENV{'HTTP_REFERER'}; | |
if ( $referal_url eq '' ) { display_pic(); return Apache2::Const::OK; } | |
if ( substr($referal_url, 0, 4) ne 'http' ) { display_pic(); return Apache2::Const::OK; } | |
#$referal_url = uri_unescape($referal_url); | |
$referal_url = clean_url($referal_url); | |
#$referal_url = quote_escape($referal_url); | |
#print "referal_url: ${referal_url}\n"; | |
#referrer (js generated) | |
my $referrer_url = ''; #reset | |
$referrer_url = $ENV{'QUERY_STRING'}; | |
my $pos; | |
$pos = ''; | |
$pos = index($referrer_url, 'ref='); | |
if ( $pos > -1 ) { | |
$referrer_url = substr($referrer_url, $pos + 4); | |
$referrer_url = uri_unescape($referrer_url); | |
#was stripping out search term so skip trim url | |
#my $pos2; | |
#$pos2 = ''; | |
#$pos2 = index($referrer_url, '&'); | |
#if ( $pos2 > -1 ) { | |
# $referrer_url = substr($referrer_url, 0, $pos2); | |
#} | |
$referrer_url = clean_url($referrer_url); | |
if ( check_url($referrer_url) == 0 ) { display_pic(); return Apache2::Const::OK; } | |
$referrer_url = substr($referrer_url, 0, 255); #max 255 chars | |
#$referrer_url = quote_escape($referrer_url); | |
#print "referrer_url: ${referrer_url}\n"; | |
} | |
#valide urls are good format | |
if ( check_url($referal_url) == 0 ) { display_pic(); return Apache2::Const::OK; } | |
$referal_url = substr($referal_url, 0, 255); #max 255 chars | |
#user agent | |
my $referal_ua = trim($ENV{'HTTP_USER_AGENT'}); | |
if ( $referal_ua eq '' ) { display_pic(); return Apache2::Const::OK; } | |
$referal_ua = quote_escape($referal_ua); | |
$referal_ua = substr($referal_ua, 0, 512); #max 512 chars | |
#browser language if set (code, will convert to proper name in post-processing). | |
my $referal_lng = trim($ENV{'HTTP_ACCEPT_LANGUAGE'}); | |
if ( $referal_lng ne '' ) { | |
$pos = 0; | |
$pos = index($referal_lng, ','); | |
if ($pos > -1) { | |
$referal_lng = substr($referal_lng, 0, $pos); | |
} | |
$referal_lng = lc $referal_lng; | |
$referal_lng = substr($referal_lng, 0, 7); | |
$referal_lng = quote_escape($referal_lng); | |
} | |
#print "referal_lng: ${referal_lng}\n"; | |
# | |
my $cat = 'foo'; # scribe category | |
my $ip = request_IP(); | |
#print "ip: ${ip}\n"; | |
my $referal_3166_1 = $ENV{'GEOIP_COUNTRY_CODE'}; | |
if ( $referal_3166_1 eq '' ) { display_pic(); return Apache2::Const::OK; } | |
if ( $referal_3166_1 eq 'A1' ) { display_pic(); return Apache2::Const::OK; } #"Anonymous Proxy" | |
if ( $referal_3166_1 eq 'A2' ) { display_pic(); return Apache2::Const::OK; } #"Satellite Provider" | |
if ( $referal_3166_1 eq 'O1' ) { display_pic(); return Apache2::Const::OK; } #"Other Country" | |
if ( $referal_3166_1 eq 'EU' ) { display_pic(); return Apache2::Const::OK; } #"Europe" country yet to be given an ISO 3166 | |
if ( $referal_3166_1 eq 'AP' ) { display_pic(); return Apache2::Const::OK; } #"Asia/Pacific Region" country yet to be given an ISO 3166 | |
my $referal_city = $ENV{'GEOIP_CITY'}; | |
my $referal_region = $ENV{'GEOIP_REGION_NAME'}; | |
my $referal_contnt = $ENV{'GEOIP_CONTINENT_CODE'}; | |
my $referal_coords = $ENV{'GEOIP_LATITUDE'} . ' ' . $ENV{'GEOIP_LONGITUDE'}; | |
my $referal_ip_md5 = md5_hex($ip); | |
my $ip_tunc = truncate_ip($ip); | |
#print "ip_tunc: ${ip_tunc}\n"; | |
#2011-03-25 01:02:53 | |
my $gmt_dt = strftime "%Y-%m-%d %H:%M:%S", gmtime; | |
#print "gmt_dt: ${gmt_dt}\n"; | |
my $gmt_toH_dt = strftime "%Y-%m-%d %H", gmtime; #0000-00-00 00 (hourly for poll) | |
my $md5 = md5_hex($gmt_toH_dt.$referal_url.$ip.$referal_ua); #added ua cause networks could share same ip | |
my $uid = md5_to_int($md5); | |
#print "uid: ${uid}\n"; | |
if ( $referal_lng eq '' ) { $referal_lng = '-'; } | |
if ( $referal_city eq '' ) { $referal_city = '-'; } | |
if ( $referal_region eq '' ) { $referal_region = '-'; } | |
if ( $referal_contnt eq '' ) { $referal_contnt = '-'; } | |
if ( $referal_coords eq '' ) { $referal_coords = '-'; } | |
if ( $referal_coords eq ' ' ) { $referal_coords = '-'; } | |
if ( $referrer_url eq '' ) { $referrer_url = '-'; } | |
# --------------------------------------------------- LOG | |
my $log; | |
eval{ | |
$log = ''; | |
$log = "$gmt_dt $uid $referal_3166_1 $referal_ip_md5 \"$ip_tunc\" \"$referal_city\" \"$referal_region\" \"$referal_contnt\" \"$referal_coords\" \"$referal_lng\" \"$referal_url\" \"$referrer_url\" \"$referal_ua\""; | |
#print "log: ${log}\n"; | |
my $host = 'localhost'; | |
my $port = 1463; | |
my $socket = Thrift::Socket->new($host, $port); | |
my $transport = Thrift::FramedTransport->new($socket); | |
my $proto = Thrift::BinaryProtocol->new($transport); | |
my $client = Scribe::Thrift::scribeClient->new($proto, $proto); | |
$transport->open(); | |
#log message to cat | |
my $le = Scribe::Thrift::LogEntry->new({ category => $cat }); | |
$le->message($log); | |
$result = -1; | |
$result = $client->Log([ $le ]); | |
#print Dumper($result); | |
if ( $result != 0 ) { | |
error_log($cat, $log); #err log to error log | |
} | |
#these return generate perl error | |
#if ($result == Scribe::Thrift::ResultCode::TRY_LATER) { | |
# print STDERR "TRY_LATER\n"; | |
#} elsif ($result != Scribe::Thrift::ResultCode::OK) { | |
# print STDERR "Unknown result code: $result\n"; | |
#} | |
#close scribe | |
$transport->close(); | |
} or do { | |
### catch block | |
#print "FAILED to connect"; | |
error_log($cat, $log); | |
}; | |
display_pic(); | |
return Apache2::Const::OK; | |
} | |
1; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# web server config in eg: /etc/apache2/sites-available/default | |
<IfModule mod_perl.c> | |
<Files ~ "\.gif$"> | |
SetHandler perl-script | |
PerlHandler Analumic::Jotr | |
PerlSendHeader On | |
</Files> | |
PerlSwitches -wT | |
PerlModule Scribe::Thrift::scribe | |
PerlModule Thrift::Socket | |
PerlModule Thrift::FramedTransport | |
PerlModule Thrift::BinaryProtocol | |
PerlModule Apache2::RequestRec | |
PerlModule Apache2::RequestIO | |
PerlModule Apache2::RequestUtil | |
PerlModule APR::Table | |
</IfModule> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment