Created
February 4, 2010 13:42
-
-
Save mtsukamoto/294617 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package WebScraper::Wassr::Timeline; | |
use warnings; | |
use strict; | |
use base qw(Class::Accessor::Fast); | |
use Carp; | |
use HTML::ResolveLink; | |
use URI::Escape; | |
use WWW::Mechanize; | |
use Web::Scraper; | |
WebScraper::Wassr::Timeline->mk_accessors(qw(user password next_url prev_url)); | |
sub ua { | |
my $self = shift; | |
unless ($self->{'ua'}) { | |
$self->{'ua'} = WWW::Mechanize->new; | |
$self->{'ua'}->env_proxy; | |
} | |
return $self->{'ua'}; | |
} | |
sub login { | |
my $self = shift; | |
return 0 if (not $self->user || not $self->password); | |
my $ua = $self->ua; | |
$ua->get('http://wassr.jp/')->is_success || die $ua->res->status_line; | |
return 0 if (not $ua->res && $ua->res->is_success); | |
return 0 if (not $ua->form_name('LoginForm')); | |
$ua->set_fields('login_id' => $self->user, 'login_pw' => $self->password); | |
$ua->submit_form->is_success || die $ua->res->status_line; | |
my $scraper = scraper { | |
process 'id("loginuser")', 'message' => 'TEXT'; | |
process 'id("login_error_message")', 'error' => 'TEXT'; | |
}; | |
my $result = $scraper->scrape($ua->res); | |
my $error = 'Login failed' if (not $result->{'message'}); | |
$error = $result->{'error'} if ($result->{'error'}); | |
if ($error) { | |
$error =~ s/\s+/ /g; | |
die $error; | |
} | |
return $result->{'message'}; | |
} | |
sub user_timeline { | |
my $self = shift; | |
my $args = shift || {}; | |
my $user = $args->{'user'} || $self->user; | |
my $page = $args->{'page'} || 1; | |
my $url = $args->{'url'}; | |
die "Page must be a number, '$page' is invalid." if ($page !~ /^\d+$/); | |
my $ua = $self->ua; | |
$url ||= sprintf('http://wassr.jp/user/%s?page=%s&ajax_response=0', uri_escape_utf8($user), $page); | |
$ua->get($url)->is_success || die $ua->res->status_line; | |
my $result = $self->_parse($ua->content, $ua->uri); | |
my $entries = $result->{'entries'}; | |
die "Something wrong in parsing on '$url'." if (not ref($entries) eq 'ARRAY'); | |
die "No entry exists on '$url'." if (not scalar @$entries); | |
$self->next_url($result->{'next'}); | |
$self->prev_url($result->{'prev'}); | |
return $result; | |
} | |
sub _parse { | |
my $self = shift; | |
my $content = shift; | |
my $url = shift; | |
my $scraper = $self->_scraper_wassr($url); | |
my $result = $scraper->scrape($content, $url); | |
foreach my $entry (@{$result->{'entries'}}) { | |
$entry->{'created_at'} =~ s/\(.*\)//; | |
$entry = _url_abs_stringify($entry, $url); | |
$entry = _entry_twitterify($entry, $url); | |
} | |
return $result; | |
} | |
sub _url_abs_stringify { | |
my $data = shift; | |
my $base = shift; | |
if (ref($data) eq 'ARRAY') { | |
$data->[$_] = _url_abs_stringify($data->[$_], $base) for (0..$#{$data}); | |
} elsif (ref($data) eq 'HASH') { | |
$data->{$_} = _url_abs_stringify($data->{$_}, $base) for (keys(%{$data})); | |
} elsif (UNIVERSAL::isa($data, 'URI')) { | |
$data = $data->abs($base)->as_string; | |
} | |
return $data; | |
} | |
sub _entry_twitterify { | |
my $entry = shift; | |
$entry->{'html'} = delete($entry->{'description'}); | |
$entry->{'html'} =~ s/^.*?<img alt="" class="inlineicon" height="16" src="http:\/\/wassr.jp\/img\/icn-balloon.gif" width="21" \/>//s; | |
$entry->{'html'} =~ s/\s*<a class="thickbox" .*?>.*?<\/a>//s; | |
$entry->{'html'} =~ s/(?:(?:\s*<br \/>\s*)+|\s+)$//s; | |
$entry->{'attatched_image'}->{'image_url'} = delete($entry->{'image'}) if ($entry->{'image'}); | |
$entry->{'attatched_image'}->{'thumbnail_url'} = delete($entry->{'thumbnail'}) if ($entry->{'thumbnail'}); | |
$entry->{'user'} = {}; | |
$entry->{'user'}->{'id'} = $entry->{'screen_name'}; | |
$entry->{'user'}->{'name'} = delete($entry->{'name'}); | |
$entry->{'user'}->{'screen_name'} = delete($entry->{'screen_name'}); | |
$entry->{'created_at'} = gmtime(HTTP::Date::str2time($entry->{'created_at'}, '+0900')); | |
$entry->{'created_at'} =~ s/ (\d{4})$/ +0000 $1/; | |
$entry->{'id'} = $1 if ($entry->{'url'} =~ m{/statuses/([^/]*)}); | |
delete($entry->{'url'}); | |
if ($entry->{'in_reply_to'}) { | |
if ($entry->{'in_reply_to'} =~ s/^(.+) by (.*?[^ ])$//) { | |
$entry->{'in_reply_to_text'} = $1; | |
$entry->{'in_reply_to_name'} = $2; | |
} | |
delete($entry->{'in_reply_to'}); | |
} | |
if ($entry->{'in_reply_to_url'}) { | |
if ($entry->{'in_reply_to_url'} =~ m{/user/(.*?)/statuses/([^/]*)}) { | |
$entry->{'in_reply_to_user_id'} = $1; | |
$entry->{'in_reply_to_screen_name'} = $1; | |
$entry->{'in_reply_to_status_id'} = $2; | |
} | |
delete($entry->{'in_reply_to_url'}); | |
} | |
if ($entry->{'favorited_status'} && scalar @{$entry->{'favorited_status'}}) { | |
foreach my $favorited (@{$entry->{'favorited_status'}}) { | |
my $user = {}; | |
$user->{'profile_image_url'} = delete($favorited->{'image'}); | |
$user->{'screen_name'} = delete($favorited->{'title'}); | |
$user->{'id'} = $user->{'screen_name'}; | |
delete($favorited->{'url'}); | |
$favorited->{'user'} = $user; | |
} | |
} | |
return $entry; | |
} | |
sub _scraper_wassr { | |
my $self = shift; | |
my $url = shift || $self->ua->uri || 'http://wassr.jp/'; | |
my $resolver = sub { my $rl = HTML::ResolveLink->new(base => $url ); return $rl->resolve($_[0]); }; | |
$self->{'scraper_wassr'} ||= scraper { | |
process 'div.MsgBody', 'entries[]' => scraper { | |
process 'p.description', 'description' => ['HTML', $resolver]; | |
process 'a.thickbox', 'image' => '@href'; | |
process 'a.thickbox img', 'thumbnail' => '@src'; | |
process 'span.res a', 'in_reply_to' => ['HTML', $resolver], 'in_reply_to_url' => '@href'; | |
process 'div.favorite_list a', 'favorited_status[]' => scraper { | |
process 'a', 'title' => '@title', 'url' => '@href'; | |
process 'img', 'image' => '@src'; | |
}; | |
process 'a.MsgUserName', 'name' => 'TEXT', 'screen_name' => '@title'; | |
process 'a.MsgDateTime', 'created_at' => 'TEXT', 'url' => '@href', 'text' => '@title'; | |
process '//p[@class="messagefoot_info"]//a[not(@class)]', 'source' => 'TEXT'; | |
}; | |
process 'id("PagerNextLink")', 'next' => '@href'; | |
process 'id("PagerPrevLink")', 'prev' => '@href'; | |
}; | |
return $self->{'scraper_wassr'}; | |
} | |
1; | |
=head1 NAME | |
WebScraper::Wassr::Timeline - Scraping Wassr timeline. | |
=head1 VERSION | |
Version 0.01 | |
=cut | |
our $VERSION = '0.01'; | |
=head1 SYNOPSIS | |
use WebScraper::Wassr::Timeline; | |
use YAML; | |
my $wassr = WebScraper::Wassr::Timeline->new({'user' => 'yourname', 'password' => 'opensesami'}); | |
$wassr->login; | |
foreach my $page (1..5) { | |
my $entries = $wassr->user_timeline({ | |
'user' => 'friendname', # default is you. | |
'page' => $page # default is first page. | |
}); | |
print YAML::Dump($entries); | |
last if (not $wassr->next_url); | |
} | |
=head1 FUNCTIONS | |
=head2 new | |
Constructor. | |
=head2 login | |
Login to wassr with 'user' and 'password'. | |
=head2 user_timeline | |
Fetch user timeline. You can specify user id as 'user', page number as 'page'. | |
=head2 next_url | |
Returns 'next' page url. | |
=head2 prev_url | |
Returns 'prev' (means previous) page url. | |
=head2 user | |
Accessor for 'user' parameter used to wassr authentication. | |
=head2 password | |
Accessor for 'password' parameter used to wassr authentication. | |
=head2 ua | |
Accessor for 'ua' parameter contains WWW::Mechanize object used to access wassr. | |
=head1 AUTHOR | |
Makio Tsukamoto, C<< <tsukamoto at gmail.com> >> | |
=head1 COPYRIGHT & LICENSE | |
Copyright 2010 Makio Tsukamoto, all rights reserved. | |
This program is free software; you can redistribute it and/or modify it | |
under the same terms as Perl itself. | |
=cut | |
1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment