Skip to content

Instantly share code, notes, and snippets.

@taiyoh
Created November 23, 2010 08:19
Show Gist options
  • Save taiyoh/711450 to your computer and use it in GitHub Desktop.
Save taiyoh/711450 to your computer and use it in GitHub Desktop.
#!perl -w
use strict;
use utf8;
use Encode qw/decode_utf8 encode_utf8/;
use LWP::UserAgent;
use URI;
use JSON::XS;
use URI::Escape::XS;
use HTML::TreeBuilder::XPath;
# Script body: takes one search word from the command line, fetches the
# Google search page with the query parameters below, extracts the
# "nextRequest" path embedded in that page and then polls that path
# repeatedly, printing one line per entry of the decoded "results" list.
#
# The script dies with a message when a request fails, when the response
# cannot be decoded as JSON, or when no "nextRequest" path can be found in
# the first page. The polling loop ends when a response carries no further
# "nextRequest" value.
my $word = shift;
die "Usage: $0 <search word>\n" unless defined $word && length $word;
$word = decode_utf8($word);

my $json = JSON::XS->new->ascii->utf8(1);

my $ua = LWP::UserAgent->new;
$ua->agent('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.44 Safari/534.7');
$ua->cookie_jar({});

# Seconds to wait between two polling requests.
my $poll_interval = 10;

my $uri = URI->new('http://www.google.co.jp/search');
$uri->query_form({
    q      => $word,
    hl     => 'ja',
    prmd   => 'num',
    tbo    => 'u',
    sa     => 'X',
    oi     => 'realtime_result_group_more_results_link',
    ct     => 'title',
    resnum => 2,
    tbs    => 'mbl:1'
});

my $res = $ua->get($uri);
die 'Initial search request failed: ' . $res->status_line . "\n"
    unless $res->is_success;

my $html = $res->decoded_content;
die "Could not decode the body of the initial search page\n"
    unless defined $html;

# The first page embeds the path of the follow-up request as a quoted value.
my ($request_url) = ($html =~ /"nextRequest":"(.+?)"/);
die "No nextRequest path found in the initial search page\n"
    unless defined $request_url;
$request_url = decode_utf8(sanitize($request_url));

while (defined $request_url && length $request_url) {
    #print encode_utf8("[URL] $request_url\n\n");
    my $poll = $ua->get("http://www.google.co.jp${request_url}");
    die 'Polling request failed: ' . $poll->status_line . "\n"
        unless $poll->is_success;

    # Unwrap the mbrt0.insert('...'); call so only its argument remains.
    my $payload = $poll->content;
    $payload =~ s{^mbrt0\.insert\('(.+)'\);$}{$1};

    my $data = eval { $json->decode(sanitize($payload)) };
    die "Could not decode polling response as JSON: $@" if $@;
    die "Polling response is not a JSON object\n" unless ref $data eq 'HASH';

    for my $result (@{ $data->{results} || [] }) {
        my $date = $result->{date};

        # Undo the quote escaping that sanitize() added before JSON decoding.
        my $fragment = defined $result->{html} ? $result->{html} : '';
        $fragment =~ s{\\'}{'}g;

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse_content($fragment);

        # NOTE(review): the XPaths below depend on the markup of the result
        # fragment; judging by the variable names they pick the entry text,
        # the sender, the client and a timestamp -- confirm against a live
        # response.
        my $title = $tree->findvalue('/html/body//tr/td[2]/div/div[1]');
        my $from  = $tree->findvalue('/html/body//tr/td[2]/div/div[1]/span/a/span');
        my $via   = $tree->findvalue('/html/body//tr/td[2]/div/div[2]/span[1]');
        my $d =
            $tree->findvalue('/html/body//tr/td[2]/div/div[2]/a[1]/span/text()')
            || $tree->findvalue('/html/body//tr/td[2]/div/div[3]/a[1]/span/text()');

        # Drop the shortest leading run that ends in "<char>:<char>".
        $title =~ s{^(.+?).:.}{};

        if ($via) {
            $from = "\@${from}";
        }
        else {
            $via = $tree->findvalue('/html/body//tr/td[2]/div/div[2]/cite');
        }

        print encode_utf8("${date}> ${title} [${from}][via ${via}][${d}]\n");
        #print $tree->as_HTML ."\n" unless $via;

        # Release the tree explicitly; one tree is built per result and the
        # loop may run for a long time.
        $tree->delete;
    }

    # Stop polling once the response no longer names a follow-up request.
    $request_url = $data->{nextRequest};
    last unless defined $request_url && length $request_url;

    sleep $poll_interval;
}
# Convert an escaped fragment of a response into the form the rest of the
# script expects.
#
# Steps, applied in this order:
#   1. every "\x" is rewritten as "%";
#   2. the result is passed through decodeURIComponent (URI::Escape::XS);
#   3. the backslash in front of "=", "&", "<" and ">" is removed;
#   4. every single quote gets a backslash put in front of it.
#
# Takes the string as its only argument and returns the converted copy;
# the caller's variable is not modified.
sub sanitize {
    my ($text) = @_;

    $text =~ s{\\x}{%}g;
    $text = decodeURIComponent($text);

    # Ordered [pattern, replacement] pairs applied after decoding.
    my @rewrites = (
        [ qr{\\=}, q{=}   ],
        [ qr{\\&}, q{&}   ],
        [ qr{\\<}, q{<}   ],
        [ qr{\\>}, q{>}   ],
        [ qr{\'},  q{\\'} ],
    );

    for my $rule (@rewrites) {
        my ($pattern, $replacement) = @{$rule};
        $text =~ s{$pattern}{$replacement}g;
    }

    return $text;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment