nkmrgk · February 29, 2012 16:51
diff --git a/googlehistory.pl b/googlehistory.pl
 #!/usr/bin/perl
 #
 # googlehistory.pl - Downloader for Google history
 #
 use strict;
 use warnings;
 use Encode;
 use utf8;
 use URI;
 use HTTP::Cookies;
 use WWW::Mechanize;
 use HTML::Entities;

 # --- Credentials and crawl limits -------------------------------------
 my $account  = 'your-google-account';    # value typed into the login form's Email field
 my $password = 'your-google-password';   # value typed into the login form's Passwd field
 my $max_page = 0;                        # pages to fetch; a false value (0) means no limit
 my $wait_sec = 1;                        # seconds to sleep between page fetches

 # --- Files, timeouts and console output -------------------------------
 my $tsv              = 'googlehistory.tsv';           # output file; rows are appended
 my $cache            = 'googlehistory.cache_cookie';  # cookie jar file reused between requests
 my $cache_expire     = 60 * 60;                       # cookie file older than this (seconds) forces a new login
 my $http_timeout     = 300;                           # timeout handed to WWW::Mechanize
 my $login_max_retry  = 3;                             # attempts per page before giving up
 my $verbose          = 1;                             # true: echo each history entry to the console
 my $console_encoding = 'cp932';                       # encoding used for the console echo

 # get($uri) - Fetch one history page, logging in first when necessary.
 #
 # Cookies are kept in the file named by $cache. While that file exists and
 # is younger than $cache_expire seconds the stored cookies are reused;
 # otherwise the file and the in-memory jar are discarded and the login
 # form is submitted with $account / $password.
 #
 # A response is treated as a logged-in history page when it contains the
 # marker "bkmk_href_". If it does not, the cookies are thrown away and the
 # whole sequence is retried, up to $login_max_retry attempts.
 #
 # Returns the page body as a UTF-8 encoded byte string.
 # Dies with "Login failed N times." after the last failed attempt; any
 # exception raised by WWW::Mechanize propagates to the caller.
 sub get {
    my ($uri) = @_;
    my $mech = WWW::Mechanize->new(timeout => $http_timeout);
    my $cookie = HTTP::Cookies->new(file => $cache, autosave => 1);
    $mech->agent_alias('Windows IE 6');
    $mech->cookie_jar($cookie);
    my $content = '';
    for (my $i=1; $i<=$login_max_retry; $i++) {
        my $cache_is_fresh = -f $cache
            && time - scalar((stat($cache))[9]) < $cache_expire;
        if (!$cache_is_fresh) {
            # The jar was loaded from $cache when it was constructed, so
            # removing the file is not enough: the stale cookies must be
            # dropped from memory too, or they are sent with the login.
            unlink $cache;
            $cookie->clear;
            $mech->get('https://www.google.co.jp/accounts/Login');
            $mech->form_number(1);
            $mech->field(Email => $account);
            $mech->field(Passwd => $password);
            $mech->click();
        }
        warn "GET $uri\n";
        my $res = $mech->get($uri);
        # The rest of the script works on UTF-8 bytes (parse() decodes
        # them for console output), so encode the decoded characters
        # explicitly instead of flipping Perl's internal UTF-8 flag.
        my $decoded = $res->decoded_content;
        $content = defined $decoded ? Encode::encode_utf8($decoded) : '';
        if ($content !~ /bkmk_href_/) {
            die "Login failed $i times." if ($i == $login_max_retry);
            warn "Login retry $i\n";
            # Force a fresh login on the next attempt.
            unlink $cache;
            $cookie->clear;
            next;
        }
        # Success
        last;
    }
    return $content;
 }

 # parse($content) - Turn one history page into TSV rows.
 #
 # The page is cut at every "</div>" and each piece is checked, in order,
 # against four patterns:
 #   * an <h1> heading, which becomes the day used for the rows that follow
 #     (the first "&nbsp;" and any parenthesised text are stripped);
 #   * an anchor with id "bkmk_href_" followed by an HH:MM cell, which
 #     yields a row "day, time, title, link";
 #   * an anchor with id "bkmk_href_<digits/dashes>", which yields a row
 #     "day, id suffix, title, value of the title attribute";
 #   * a "kd-button" anchor pointing at ./lookup?hl=ja&max=N, which is
 #     taken as the next page and ends the scan.
 # When $verbose is true, timed entries are also echoed to the console in
 # $console_encoding.
 #
 # Returns ($lines, $nextlink): the accumulated TSV text and the absolute
 # URL of the next page (undef when no such link was found).
 sub parse {
    my ($content) = @_;

    my $heading_re = qr|<h1>(.*?)</h1>|;
    my $visit_re   = qr|href="(.*?)".*? id="bkmk_href_">(.*?)</a>.*?(\d\d:\d\d)</td>|;
    my $bkmk_re    = qr|title="(.*?)".*? id="bkmk_href_([\d\-]+)">(.*?)</a>|;
    my $more_re    = qr|<a class="kd-button" href="(\./lookup\?hl=ja&max=\d+)">|;

    my ($day, $lines) = ('', '');
    my $nextlink;

    foreach my $chunk (split(/<\/div>/, $content)) {
        # Day heading: remembered until the next heading appears.
        if (my ($heading) = $chunk =~ $heading_re) {
            $heading =~ s/&nbsp;//;
            $heading =~ s/\(.*?\)//g;
            $day = $heading;
        }

        # Entry carrying a time of day.
        if (my ($url, $title, $hm) = $chunk =~ $visit_re) {
            my @row = ($day, $hm, $title, decode_entities($url));
            $lines .= join("\t", @row) . "\n";
            if ($verbose) {
                $| = 1;    # unbuffered console output
                my $echo = decode('utf8', "$day $hm $title\n");
                print encode($console_encoding, $echo);
            }
        }

        # Entry identified by a numeric id suffix instead of a time.
        if (my ($url, $bkmk, $title) = $chunk =~ $bkmk_re) {
            my @row = ($day, $bkmk, $title, decode_entities($url));
            $lines .= join("\t", @row) . "\n";
        }

        # Link to the next page: make it absolute and stop scanning.
        if (my ($relative) = $chunk =~ $more_re) {
            $nextlink = $relative;
            $nextlink =~ s|.*lookup|https://www.google.com/history/lookup|;
            last;
        }
    }

    return ($lines, $nextlink);
 }

 # crowl_history($uri) - Download one history page and append it to $tsv.
 #
 # Fetches $uri with get(), converts it with parse() and appends the
 # resulting rows to the file named by $tsv.
 #
 # Returns the URL of the next page as reported by parse() (undef when
 # there is none).
 # Dies when the output file cannot be opened, written or closed, so a
 # full disk or similar failure is not silently ignored.
 sub crowl_history {
    my ($uri) = @_;
    my $content = get($uri);
    my ($lines, $nextlink) = parse($content);
    open my $add, '>>', $tsv or die "Cannot open $tsv for appending: $!";
    print {$add} $lines or die "Cannot write to $tsv: $!";
    # Buffered write errors only surface when the handle is closed.
    close $add or die "Cannot close $tsv: $!";
    return $nextlink;
 }

 # main($uri) - Crawl history pages starting at $uri.
 #
 # Falls back to the Japanese-locale lookup page when $uri is false.
 # Follows the next-page link returned by crowl_history() until there is
 # none, or until $max_page pages were fetched (a false $max_page means
 # no limit). Sleeps $wait_sec seconds after every page that has a
 # successor.
 sub main {
    my ($uri) = @_;
    $uri ||= 'https://www.google.com/history/lookup?hl=ja';

    my $page = 0;
    while (1) {
        $page++;
        last if $max_page && $page > $max_page;

        my $nextlink = crowl_history($uri);
        last unless $nextlink;

        $uri = $nextlink;
        sleep($wait_sec);
    }
 }

 # Entry point: the optional first command-line argument is the start URL.
 main(shift @ARGV);

 __END__
	#!/usr/bin/perl
	#
	# googlehistory.pl - Downloader for Google history
	#
	use strict;
	use warnings;
	use Encode;
	use utf8;
	use URI;
	use HTTP::Cookies;
	use WWW::Mechanize;
	use HTML::Entities;

	my $account = 'your-google-account';
	my $password = 'your-google-password';
	my $max_page = 0;
	my $wait_sec = 1;

	my $tsv = 'googlehistory.tsv';
	my $cache = 'googlehistory.cache_cookie';
	my $cache_expire = 60*60;
	my $http_timeout = 300;
	my $login_max_retry = 3;
	my $verbose = 1;
	my $console_encoding = 'cp932';

	sub get {
	my ($uri) = @_;
	my $mech = WWW::Mechanize->new(timeout => $http_timeout);
	my $cookie = HTTP::Cookies->new(file => $cache, autosave => 1);
	$mech->agent_alias('Windows IE 6');
	my $content;
	for (my $i=1; $i<=$login_max_retry; $i++) {
	if (-f $cache && time - scalar((stat($cache))[9]) < $cache_expire) {
	$mech->cookie_jar($cookie);
	}
	else {
	unlink $cache;
	$mech->cookie_jar($cookie);
	$mech->get('https://www.google.co.jp/accounts/Login');
	$mech->form_number(1);
	$mech->field(Email => $account);
	$mech->field(Passwd => $password);
	$mech->click();
	}
	warn "GET $uri\n";
	my $res;
	eval { $res = $mech->get($uri) };
	die $@ if ($@);
	$content = $res->decoded_content;
	Encode::_utf8_off($content);
	if ($content !~ /bkmk_href_/) {
	die "Login failed $i times." if ($i == $login_max_retry);
	warn "Login retry $i\n";
	unlink $cache;
	next;
	}
	# Success
	last;
	}
	return $content;
	}

	sub parse {
	my ($content) = @_;
	my $day = '';
	my $lines = '';
	my $nextlink;
	for (split(/<\/div>/, $content)) {
	if (m|<h1>(.*?)</h1>|) {
	$day = $1;
	$day =~ s/&nbsp;//;
	$day =~ s/\(.*?\)//g;
	}
	if (m|href="(.*?)".*? id="bkmk_href_">(.*?)</a>.*?(\d\d:\d\d)</td>|) {
	my ($link, $title, $hm) = ($1, $2, $3);
	$link = decode_entities($link);
	$lines .= join "\t", ($day, $hm, $title, $link);
	$lines .= "\n";
	if ($verbose) {
	$| = 1;
	print encode($console_encoding,
	decode('utf8', "$day $hm $title\n"));
	}
	}
	if (m|title="(.*?)".*? id="bkmk_href_([\d\-]+)">(.*?)</a>|) {
	my ($link, $bkmk, $title) = ($1, $2, $3);
	$link = decode_entities($link);
	$lines .= join "\t", ($day, $bkmk, $title, $link);
	$lines .= "\n";
	}
	if (m|<a class="kd-button" href="(\./lookup\?hl=ja&max=\d+)">|) {
	$nextlink = $1;
	$nextlink =~ s|.*lookup|https://www.google.com/history/lookup|;
	last;
	}
	}
	return ($lines, $nextlink);
	}

	sub crowl_history {
	my ($uri) = @_;
	my $content = get($uri);
	my ($lines, $nextlink) = parse($content);
	open my $add, '>>', $tsv or die $!;
	print $add $lines;
	close $add;
	return $nextlink;
	}

	sub main {
	my ($uri) = @_;
	my $default_uri = 'https://www.google.com/history/lookup?hl=ja';
	$uri = $default_uri if (! $uri);
	for (my $i=1; $max_page ? $i<=$max_page : 1; $i++) {
	my $nextlink = crowl_history($uri);
	last if !$nextlink;
	$uri = $nextlink;
	sleep($wait_sec);
	}
	}

	my $uri = shift @ARGV;
	main($uri);

	__END__