Skip to content

Instantly share code, notes, and snippets.

@mtsukamoto
Created February 4, 2010 13:42
Show Gist options
  • Save mtsukamoto/294617 to your computer and use it in GitHub Desktop.
Save mtsukamoto/294617 to your computer and use it in GitHub Desktop.
package WebScraper::Wassr::Timeline;
use warnings;
use strict;
use base qw(Class::Accessor::Fast);
use Carp;
use HTML::ResolveLink;
use URI::Escape;
use WWW::Mechanize;
use Web::Scraper;
WebScraper::Wassr::Timeline->mk_accessors(qw(user password next_url prev_url));
sub ua {
my $self = shift;
unless ($self->{'ua'}) {
$self->{'ua'} = WWW::Mechanize->new;
$self->{'ua'}->env_proxy;
}
return $self->{'ua'};
}
sub login {
my $self = shift;
return 0 if (not $self->user || not $self->password);
my $ua = $self->ua;
$ua->get('http://wassr.jp/')->is_success || die $ua->res->status_line;
return 0 if (not $ua->res && $ua->res->is_success);
return 0 if (not $ua->form_name('LoginForm'));
$ua->set_fields('login_id' => $self->user, 'login_pw' => $self->password);
$ua->submit_form->is_success || die $ua->res->status_line;
my $scraper = scraper {
process 'id("loginuser")', 'message' => 'TEXT';
process 'id("login_error_message")', 'error' => 'TEXT';
};
my $result = $scraper->scrape($ua->res);
my $error = 'Login failed' if (not $result->{'message'});
$error = $result->{'error'} if ($result->{'error'});
if ($error) {
$error =~ s/\s+/ /g;
die $error;
}
return $result->{'message'};
}
sub user_timeline {
my $self = shift;
my $args = shift || {};
my $user = $args->{'user'} || $self->user;
my $page = $args->{'page'} || 1;
my $url = $args->{'url'};
die "Page must be a number, '$page' is invalid." if ($page !~ /^\d+$/);
my $ua = $self->ua;
$url ||= sprintf('http://wassr.jp/user/%s?page=%s&ajax_response=0', uri_escape_utf8($user), $page);
$ua->get($url)->is_success || die $ua->res->status_line;
my $result = $self->_parse($ua->content, $ua->uri);
my $entries = $result->{'entries'};
die "Something wrong in parsing on '$url'." if (not ref($entries) eq 'ARRAY');
die "No entry exists on '$url'." if (not scalar @$entries);
$self->next_url($result->{'next'});
$self->prev_url($result->{'prev'});
return $result;
}
sub _parse {
my $self = shift;
my $content = shift;
my $url = shift;
my $scraper = $self->_scraper_wassr($url);
my $result = $scraper->scrape($content, $url);
foreach my $entry (@{$result->{'entries'}}) {
$entry->{'created_at'} =~ s/\(.*\)//;
$entry = _url_abs_stringify($entry, $url);
$entry = _entry_twitterify($entry, $url);
}
return $result;
}
sub _url_abs_stringify {
my $data = shift;
my $base = shift;
if (ref($data) eq 'ARRAY') {
$data->[$_] = _url_abs_stringify($data->[$_], $base) for (0..$#{$data});
} elsif (ref($data) eq 'HASH') {
$data->{$_} = _url_abs_stringify($data->{$_}, $base) for (keys(%{$data}));
} elsif (UNIVERSAL::isa($data, 'URI')) {
$data = $data->abs($base)->as_string;
}
return $data;
}
sub _entry_twitterify {
my $entry = shift;
$entry->{'html'} = delete($entry->{'description'});
$entry->{'html'} =~ s/^.*?<img alt="" class="inlineicon" height="16" src="http:\/\/wassr.jp\/img\/icn-balloon.gif" width="21" \/>//s;
$entry->{'html'} =~ s/\s*<a class="thickbox" .*?>.*?<\/a>//s;
$entry->{'html'} =~ s/(?:(?:\s*<br \/>\s*)+|\s+)$//s;
$entry->{'attatched_image'}->{'image_url'} = delete($entry->{'image'}) if ($entry->{'image'});
$entry->{'attatched_image'}->{'thumbnail_url'} = delete($entry->{'thumbnail'}) if ($entry->{'thumbnail'});
$entry->{'user'} = {};
$entry->{'user'}->{'id'} = $entry->{'screen_name'};
$entry->{'user'}->{'name'} = delete($entry->{'name'});
$entry->{'user'}->{'screen_name'} = delete($entry->{'screen_name'});
$entry->{'created_at'} = gmtime(HTTP::Date::str2time($entry->{'created_at'}, '+0900'));
$entry->{'created_at'} =~ s/ (\d{4})$/ +0000 $1/;
$entry->{'id'} = $1 if ($entry->{'url'} =~ m{/statuses/([^/]*)});
delete($entry->{'url'});
if ($entry->{'in_reply_to'}) {
if ($entry->{'in_reply_to'} =~ s/^(.+) by (.*?[^ ])$//) {
$entry->{'in_reply_to_text'} = $1;
$entry->{'in_reply_to_name'} = $2;
}
delete($entry->{'in_reply_to'});
}
if ($entry->{'in_reply_to_url'}) {
if ($entry->{'in_reply_to_url'} =~ m{/user/(.*?)/statuses/([^/]*)}) {
$entry->{'in_reply_to_user_id'} = $1;
$entry->{'in_reply_to_screen_name'} = $1;
$entry->{'in_reply_to_status_id'} = $2;
}
delete($entry->{'in_reply_to_url'});
}
if ($entry->{'favorited_status'} && scalar @{$entry->{'favorited_status'}}) {
foreach my $favorited (@{$entry->{'favorited_status'}}) {
my $user = {};
$user->{'profile_image_url'} = delete($favorited->{'image'});
$user->{'screen_name'} = delete($favorited->{'title'});
$user->{'id'} = $user->{'screen_name'};
delete($favorited->{'url'});
$favorited->{'user'} = $user;
}
}
return $entry;
}
sub _scraper_wassr {
my $self = shift;
my $url = shift || $self->ua->uri || 'http://wassr.jp/';
my $resolver = sub { my $rl = HTML::ResolveLink->new(base => $url ); return $rl->resolve($_[0]); };
$self->{'scraper_wassr'} ||= scraper {
process 'div.MsgBody', 'entries[]' => scraper {
process 'p.description', 'description' => ['HTML', $resolver];
process 'a.thickbox', 'image' => '@href';
process 'a.thickbox img', 'thumbnail' => '@src';
process 'span.res a', 'in_reply_to' => ['HTML', $resolver], 'in_reply_to_url' => '@href';
process 'div.favorite_list a', 'favorited_status[]' => scraper {
process 'a', 'title' => '@title', 'url' => '@href';
process 'img', 'image' => '@src';
};
process 'a.MsgUserName', 'name' => 'TEXT', 'screen_name' => '@title';
process 'a.MsgDateTime', 'created_at' => 'TEXT', 'url' => '@href', 'text' => '@title';
process '//p[@class="messagefoot_info"]//a[not(@class)]', 'source' => 'TEXT';
};
process 'id("PagerNextLink")', 'next' => '@href';
process 'id("PagerPrevLink")', 'prev' => '@href';
};
return $self->{'scraper_wassr'};
}
1;
=head1 NAME
WebScraper::Wassr::Timeline - Scraping Wassr timeline.
=head1 VERSION
Version 0.01
=cut
our $VERSION = '0.01';
=head1 SYNOPSIS
use WebScraper::Wassr::Timeline;
use YAML;
my $wassr = WebScraper::Wassr::Timeline->new({'user' => 'yourname', 'password' => 'opensesami'});
$wassr->login;
foreach my $page (1..5) {
my $entries = $wassr->user_timeline({
'user' => 'friendname', # default is you.
'page' => $page # default is first page.
});
print YAML::Dump($entries);
last if (not $wassr->next_url);
}
=head1 FUNCTIONS
=head2 new
Constructor.
=head2 login
Login to wassr with 'user' and 'password'.
=head2 user_timeline
Fetch user timeline. You can specify user id as 'user', page number as 'page'.
=head2 next_url
Returns 'next' page url.
=head2 prev_url
Returns 'prev' (means previous) page url.
=head2 user
Accessor for 'user' parameter used to wassr authentication.
=head2 password
Accessor for 'password' parameter used to wassr authentication.
=head2 ua
Accessor for 'ua' parameter contains WWW::Mechanize object used to access wassr.
=head1 AUTHOR
Makio Tsukamoto, C<< <tsukamoto at gmail.com> >>
=head1 COPYRIGHT & LICENSE
Copyright 2010 Makio Tsukamoto, all rights reserved.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
=cut
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment