Last active
July 13, 2016 15:26
-
-
Save klopp/baef16317ddfc7e321745480b9ab1167 to your computer and use it in GitHub Desktop.
Тестовое задание
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# ------------------------------------------------------------------------------ | |
# Написать скрипт, который принимает в аргументах url с сайтом и делает | |
# рекурсивный обход всех страниц этого сайта (как указанного в аргументе, | |
# так и его поддомены, если они будут встречаться) и выводит их в виде дерева | |
# на stdout. | |
# | |
# Необходимо использовать Mojo::IOLoop, Mojo::UserAgent и Mojo::DOM для решения. | |
# | |
# Всё должно обрабатываться в одном процессе, одновременных запросов к любому | |
# домену должно быть не более 6. | |
# ------------------------------------------------------------------------------ | |
use Modern::Perl; | |
use Mojo::UserAgent; | |
use Mojo::IOLoop; | |
use Mojo::DOM; | |
use Mojo::URL; | |
use Const::Fast; | |
# ------------------------------------------------------------------------------ | |
# Devel stuff: | |
# ------------------------------------------------------------------------------ | |
use Carp qw/confess/; | |
use Data::Printer; | |
# ------------------------------------------------------------------------------ | |
const my $DEPTH_LIMIT => 3; | |
const my $PAGES_PER_DOMAIN => 10; | |
const my $THREADS_LIMIT => 6; | |
const my $VALID_PROTO => qr/^http/; | |
const my $DEFAULT_PROTO => 'http'; # for //links | |
const my $DEBUG => 0; | |
const my %UA_OPTIONS => ( max_redirects => 5 ); | |
# ------------------------------------------------------------------------------ | |
my %url_tree; | |
my @queue; | |
my %urls; | |
my %domain_pages; | |
# ------------------------------------------------------------------------------ | |
$ARGV[0] = 'http://google.com' if $DEBUG; | |
$ARGV[0] or _usage(); | |
my $baseurl = Mojo::URL->new( $ARGV[0] )->to_abs(); | |
my $basehost = $baseurl->host(); | |
$basehost or _usage(); | |
$basehost = qr/$basehost$/; | |
push @queue, $ARGV[0]; | |
my $ua = Mojo::UserAgent->new(%UA_OPTIONS); | |
my $spider; | |
my $active_requests = 0; | |
$spider = sub { | |
my $url = shift @queue; | |
unless ($url) { | |
return $active_requests ? undef : Mojo::IOLoop->stop(); | |
} | |
$ua->get( | |
$url => sub { | |
my ( $_ua, $tx ) = @_; | |
return unless $tx->success(); | |
$active_requests++; | |
my $added = 0; | |
say $url if $DEBUG; | |
$tx->res->dom('a[href]')->each( | |
sub { | |
my ( $e, $_id ) = @_; | |
say '>> ' . $e->{href} if $DEBUG; | |
my $url | |
= MURL->new( $e->{href} )->to_abs( $tx->req->url ); | |
my $host = $url->host(); | |
my $proto = $url->protocol() || $DEFAULT_PROTO; | |
if ( $host && $host =~ /^www\./ ) { | |
$host =~ s/^www\.//; | |
$url->host($host); | |
} | |
if ( $host | |
&& $proto | |
&& $host =~ $basehost | |
&& $proto =~ $VALID_PROTO | |
&& $url->depth() <= $DEPTH_LIMIT ) | |
{ | |
my $geturl = $url->opaque(); | |
$urls{$geturl} ||= 0; | |
$domain_pages{$host} ||= 0; | |
$active_requests--, return if $urls{$geturl}; | |
$urls{$geturl} = 1; | |
$domain_pages{$host}++; | |
$active_requests--, return | |
if $domain_pages{$host} > $PAGES_PER_DOMAIN; | |
say '>> ' . $url->depth() . " -> $url" if $DEBUG; | |
$added++; | |
push @queue, $proto . ':' . $geturl; | |
_put_to_tree($url); | |
} | |
} | |
); | |
$active_requests--; | |
$spider->() if $added; | |
} | |
); | |
}; | |
$spider->() for 1 .. $THREADS_LIMIT; | |
Mojo::IOLoop->start(); | |
_print_tree( \%url_tree, 0 ); | |
# ------------------------------------------------------------------------------ | |
sub _print_tree { | |
my ( $branch, $level ) = @_; | |
foreach my $part ( sort { length $a <=> length $b } keys %{$branch} ) { | |
if ($level) { | |
print ' ' x $level; | |
say "/$part"; | |
} | |
else { | |
say "\n$part"; | |
} | |
_print_tree( $branch->{$part}, $level + 1 ) | |
if defined $branch->{$part}; | |
} | |
} | |
# ------------------------------------------------------------------------------ | |
sub _put_to_tree { | |
my ($url) = @_; | |
my $path = $url->path(); | |
$path =~ s/\/+$//; | |
my $host = $url->host(); | |
$url_tree{$host} = undef unless exists $url_tree{$host}; | |
if ($path) { | |
$url_tree{$host} = {} unless $url_tree{$host}; | |
my @parts = split( /\/+/, $path ); | |
my $branch = $url_tree{$host}; | |
$parts[$#parts] .= $url->query() if $url->query(); | |
foreach my $part (@parts) { | |
next unless $part; | |
$branch->{$part} ||= {}; | |
$branch = $branch->{$part}; | |
} | |
} | |
} | |
# ------------------------------------------------------------------------------ | |
sub _usage { | |
die "Usage: $0 base_url\n"; | |
} | |
# ------------------------------------------------------------------------------ | |
package MURL; | |
use Modern::Perl; | |
use Mojo::URL; | |
use base qw/Mojo::URL/; | |
# ------------------------------------------------------------------------------ | |
sub opaque { | |
my ($self) = @_; | |
my $url = Mojo::URL->new($self); | |
$url->fragment('') if $url->fragment(); | |
$url->scheme(''); | |
return $url; | |
} | |
# ------------------------------------------------------------------------------ | |
sub depth { | |
my ($self) = @_; | |
my $path = $self->path()->to_abs_string(); | |
$path =~ s/\/+$//; | |
my $depth = split( /[^\/]+/, $path ); | |
return $depth; | |
} | |
# ------------------------------------------------------------------------------ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment