Last active
January 1, 2020 05:30
-
-
Save ilius/44a39c3c9686c7a1af3a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
=dependencies: | |
sudo cpan get Mojolicious | |
sudo cpan get MongoDB | |
sudo cpan get Set::Light | |
sudo cpan get Log::Log4perl | |
sudo cpan get DateTime::Format::Strptime | |
sudo cpan get File::HomeDir | |
sudo cpan get Digest::SHA1 | |
=cut | |
package LTCrawler; | |
use 5.010; | |
use open qw(:locale); | |
use strict; | |
use utf8; | |
use warnings qw(all); | |
use Mojo::UserAgent; | |
use MongoDB; | |
use Set::Light; | |
use Log::Log4perl; | |
use DateTime::Format::Strptime; | |
use File::HomeDir; | |
use File::Path qw( make_path ); | |
use Digest::SHA1 qw( sha1_hex ); | |
use DateTime; | |
use Data::Dumper; | |
## Output field separator used by print
$, = " ";

## Data::Dumper output format
$Data::Dumper::Sortkeys = 1;
$Data::Dumper::Pair     = ': ';    ## Python-like "key: value" pairs
$Data::Dumper::Indent   = 1;       ## an enum (indent style), not an indentation width
$Data::Dumper::Terse    = 1;
###################### Configuration ########################## | |
## Log4perl level for the LTCrawler category.
## One of: TRACE, DEBUG, INFO, WARN, ERROR, FATAL
my $log_level = 'TRACE';

## Limit on parallel connections
my $max_connections = 10;

## Controlling recursion: $recur->{page_type1}->{page_type2}
## The note on each entry records whether following that link type is implemented.
my $recur = {
    artist => {
        song        => 1,    ## Working
        translation => 0,    ## Not Implemented
    },
    translator => {
        topic         => 1,  ## Not Implemented, songs, translations and requests added by user
        translation   => 0,  ## Not Implemented, translations added by user
        request       => 0,  ## Not Implemented, requests added by user
        transcription => 0,  ## Not Implemented, transcriptions added by user
        artist        => 1,  ## Not Implemented, artists added by user
        comment       => 1,  ## Not Implemented, comments added by user
        vote          => 1,  ## Not Implemented, votes added by user, tab "Voted"
    },
    song => {
        translation => 1,    ## Working
        submitter   => 1,    ## Working
    },
    translation => {
        translator => 1,     ## Working
    },
};

## FIFO queue of URLs waiting to be fetched, seeded with the start page(s)
my @urls = map { Mojo::URL->new($_) } qw(
    http://lyricstranslate.com/en/andrea-bocelli-lyrics.html
);

=comment
http://lyricstranslate.com/en/fiorini-lando-lyrics.html
http://lyricstranslate.com/en/andrea-bocelli-lyrics.html
http://lyricstranslate.com/en/translator/evfokas
http://lyricstranslate.com/en/translator/aldefina
=cut

## MongoDB connection settings
my $db_name = 'LyricsTranslate';
my $db_host = 'localhost';
my $db_port = 27017;

## Site root and the language code that prefixes every page path
my $LANG = 'en';
my $SITE = 'http://lyricstranslate.com';

## Output locations, all under the current user's home directory
my $HOME      = File::HomeDir->my_home;
my $SAVE_DIR  = "$HOME/LyricsTranslate";
my $CACHE_DIR = "$SAVE_DIR/cache";

my $use_title_filename = 1;

## Number of leading SHA1 hex digits used as the cache sub-directory name
my $hash_dir_name_length = 2;

## Log4perl configuration: one file appender ($SAVE_DIR/log) and one screen appender
my $log_conf = "
log4perl.category.LTCrawler = $log_level, Logfile, Screen
log4perl.appender.Logfile = Log::Log4perl::Appender::File
log4perl.appender.Logfile.filename = $SAVE_DIR/log
log4perl.appender.Logfile.layout = Log::Log4perl::Layout::PatternLayout
log4perl.appender.Logfile.layout.ConversionPattern = [\%r] \%F \%L \%m\%n
log4perl.appender.Screen = Log::Log4perl::Appender::Screen
log4perl.appender.Screen.stderr = 0
log4perl.appender.Screen.layout = Log::Log4perl::Layout::SimpleLayout
";
######################## Initialization ########################## | |
## Create the output directories before Log4perl is initialised
## (its file appender is configured to write under $SAVE_DIR).
for my $dir ($SAVE_DIR, $CACHE_DIR) {
    next if -d $dir;
    make_path($dir) or die "Failed to create directory: $dir";
}

## The configuration text is handed to init() as a scalar reference
Log::Log4perl::init(\$log_conf);
my $log = Log::Log4perl->get_logger("LTCrawler");

## Number of requests currently in flight
my $active = 0;

## User agent following up to 5 redirects, with proxy settings auto-detected
my $ua = Mojo::UserAgent->new(max_redirects => 5);
$ua->proxy->detect;

## URLs whose fetch has already completed
my $done_urls = Set::Light->new();

## Parser for "dd.mm.yyyy" dates
my $date_dot_parser = DateTime::Format::Strptime->new(
    pattern   => '%d.%m.%Y',
    time_zone => 'UTC',
);

## Parser for "dd/mm/yyyy - HH:MM" timestamps
my $datetime_parser = DateTime::Format::Strptime->new(
    pattern   => '%d/%m/%Y - %H:%M',
    time_zone => 'UTC',
);

## Database handle and one collection object per document type
my $client = MongoDB::MongoClient->new(host => $db_host, port => $db_port);
my $db     = $client->get_database($db_name);

my @col_names = qw(song translation artist page translator);
my $cols      = {};
$cols->{$_} = $db->get_collection($_) for @col_names;

## Indexes
my @song_index_fields = qw(
    submit.date
    submit.user_id
    submit.user_page
    submit.username
    last_edit.date
    last_edit.user_id
    last_edit.user_page
    last_edit.username
    artist.page
    title
);
for my $field (@song_index_fields) {
    $cols->{song}->ensure_index({ $field => 1 });
}
$cols->{page}->ensure_index({ 'type' => 1 });

## Numeric rank stored alongside a translator's textual role
my $translator_roles_value = {
    'Novice'            => 0,
    'Junior Member'     => 1,
    'Member'            => 2,
    'Senior Member'     => 3,
    'Super Member'      => 4,
    'Editor'            => 5,
    'Moderator'         => 6,
    'Retired Moderator' => 7,
};
####################### Event Loop ############################ | |
## Every 0.01s, top up the set of running requests from the front of @urls.
## The loop is stopped once the queue is empty and nothing is in flight.
Mojo::IOLoop->recurring(
    0.01 => sub {
        ## Free slots are computed once per tick, before any request is started
        my $free_slots = $max_connections - $active;
        while ($free_slots-- > 0) {
            my $next_url = shift @urls;
            unless ($next_url) {
                Mojo::IOLoop->stop if $active < 1;
                return;
            }
            ## Passing a callback makes the fetch non-blocking; count it as active
            ++$active;
            $ua->get($next_url => \&get_callback);
        }
    }
);

## Start event loop if necessary
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
##################### Functions ############################## | |
## Log (at debug level) how many URLs are pending, done and currently running.
sub status {
    my $pending = scalar(@urls);
    my $done    = $done_urls->size();
    $log->debug("$pending Pending URLs, $done Done URLs, $active Running");
}
## Queue a page for crawling.
##   $page - site-relative page path WITHOUT the language code
## Empty pages and login links are ignored, as are URLs that were already
## fetched or already queued through this function.
sub add_page {
    my $page = shift;    ## no language code

    ## URLs queued so far. Without this, a page linked from many places is
    ## pushed (and later fetched) once per link, because $done_urls is only
    ## updated when a fetch completes.
    state %queued;

    return if not $page;
    return if $page =~ m/.*user\/login.*/;

    my $url = "$SITE/$LANG/$page";
    $log->trace($url);

    return if $done_urls->has($url);
    return if $queued{$url}++;

    push @urls, $url;
    status();
}
## Return the page path of a link element, without the leading language code.
##   argument: an element object providing ->attr('href')
##   returns:  the href with a leading "/xx/" stripped; "" for login links
##             and for elements that have no href attribute
sub get_href_page {
    my ($element) = @_;

    ## A missing href would otherwise raise "uninitialized value" warnings
    my $href = $element->attr('href') // '';

    my $page = $href =~ s|^/../||r;
    if ($page =~ m/.*user\/login.*/) {
        return "";
    }
    return $page;    ## no language code
}
## Return the path of a URL object as a string, without the leading
## language code ("/xx/").
sub get_page {
    my ($url) = @_;
    my $path = $url->path->to_string;
    $path =~ s|^/../||;
    return $path;    ## no language code
}
## Store a document in the named collection and record it in the 'page'
## collection under the same _id.
##   $col_name - key into $cols (also stored as the page's "type")
##   $doc      - hash reference; crawl_time is set on it before saving
sub db_save {
    my ($col_name, $doc) = @_;

    my $now = DateTime->now;
    $doc->{crawl_time} = $now;
    $cols->{$col_name}->save($doc);

    my $page_entry = {
        _id        => $doc->{_id},
        type       => $col_name,
        crawl_time => $now,
    };
    $cols->{page}->save($page_entry);
}
## Completion callback for $ua->get.
##   $ua2 - the user agent (unused)
##   $tx  - the finished transaction
## Marks the URL as done, then caches and parses the page when the response
## is a successful text/html one. Every path releases the connection slot
## that the event loop claimed with ++$active.
sub get_callback {
    my ($ua2, $tx) = @_;

    if (not $tx) {
        $log->error("undefined \$tx");
        --$active;    ## the slot was claimed at dispatch; release it here too
        return;
    }

    my $url = $tx->req->url;
    $done_urls->insert($url->to_string);

    if (not $tx->res) {
        $log->error("undefined \$tx->res");
    }
    elsif ($tx->res->is_empty) {
        $log->error("empty \$tx->res");
    }
    elsif (not $tx->res->is_status_class(200)) {
        $log->error("URL: $url , Response: " . $tx->res->to_string);
    }
    else {
        ## The header may be absent; treat that as an empty content type
        my $content_type = $tx->res->headers->content_type // '';
        if ($content_type !~ m{^text/html\b}ix) {
            $log->error("Content Type: " . $content_type);
        }
        else {
            ## An exception while saving or parsing must not skip the
            ## --$active below, otherwise the slot is lost for good.
            my $ok = eval {
                save_cache_page($tx);
                parse_page($tx);
                1;
            };
            if (not $ok) {
                my $err = $@ || 'unknown error';
                $log->error("failed to process $url: $err");
            }
        }
    }

    status();
    --$active;
}
## Write the raw response body of a transaction to the cache directory.
##   $tx - finished transaction
## The file is stored as $CACHE_DIR/<sha1 prefix of the page>/<page with "/"
## replaced by "_">. Nothing is written when the page path is empty.
## Failures are logged; the function never dies on I/O errors.
sub save_cache_page {
    my $tx   = shift;
    my $url  = $tx->req->url;
    my $page = get_page($url);

    my $fname = $page;
    $fname =~ s/\//_/g;    ## replace other slashes with underlines
    if ($fname eq "") { return; }

    my $dirpath = "$CACHE_DIR/" . substr(sha1_hex($page), 0, $hash_dir_name_length);
    make_path($dirpath);
    if (not -d $dirpath) {
        $log->error("failed to create directory $dirpath");
        return;
    }

    my $fpath = "$dirpath/$fname";

    ## Three-argument open with an explicit :raw layer: the body is a byte
    ## string and must not pass through the default layers that
    ## "use open qw(:locale)" installs.
    my $fh;
    if (not open($fh, '>:raw', $fpath)) {
        $log->error("failed to save file $fpath: $!");
        return;
    }
    print {$fh} $tx->res->body;

    ## Buffered write errors only surface at close
    if (not close $fh) {
        $log->error("failed to save file $fpath: $!");
        return;
    }
    $log->debug("saved file $fpath");
}
## Dispatch a fetched page to the parser for its type.
##   $tx - finished transaction
## The type is decided first by URL (translator, request) and then by marker
## elements in the DOM (artist, translation, song). Returns whatever the
## chosen parser returns; request pages and unrecognised pages return nothing.
sub parse_page {
    my $tx  = shift;
    my $url = $tx->req->url;

    if ($url =~ m/.*\/translator\/.*/) {
        return parse_translator_page($tx);
    }
    if ($url =~ m/.*\/request\/.*/) {
        return;    ## Not Implemented
    }

    my $dom = $tx->res->dom;

    ## Order matters: the first matching marker wins
    my @parsers = (
        [ 'div.artist-node-info'    => \&parse_artist_page ],
        [ 'div.translate-node-text' => \&parse_translation_page ],
        [ 'div.song-node-text'      => \&parse_song_page ],
    );
    for my $entry (@parsers) {
        my ($selector, $parser) = @$entry;
        if ($dom->find($selector)->first) {
            return $parser->($tx);
        }
    }

    ## Previously such pages were dropped without any trace in the log
    $log->debug("parse_page: unrecognized page type: $url");
    return;
}
## Parse a translator (user profile) page and save it to the 'translator'
## collection.
##   $tx - finished transaction
## Each div.uprofile-info-field becomes one key of the document: messenger
## accounts go under {accounts}, every other field is keyed by its
## lower-cased label with spaces replaced by underscores.
sub parse_translator_page {
    my $tx   = shift;
    my $url  = $tx->req->url;
    my $page = get_page($url);
    my $dom  = $tx->res->dom;

    my $username   = $page =~ s/.*translator\///r;
    my $translator = {
        _id      => $page,
        username => $username,
        accounts => {},
    };

    ## Fields carrying the class "uprofile-info-<kind>" are account entries
    my @account_kinds = qw(icq msn google skype yahoo);

    FIELD:
    for my $field_div ($dom->find('div.uprofile-info-field')->each) {
        ## The "Send PM" link carries the numeric user id at the end of its href
        if ($field_div =~ m/.*Send PM.*/) {
            my $pm_a    = $field_div->find('a')->first;
            my $pm_href = $pm_a ? $pm_a->attr('href') : undef;
            if (defined $pm_href) {
                $translator->{user_id} = ($pm_href =~ s|.*/new/||r) + 0;
            }
            next FIELD;
        }

        my $value_div = $field_div->find('div.uprofile-value')->first
            || $field_div->find('div.uprofile-value-inline')->first;
        if (not $value_div) {
            $log->warn("no value element found for uprofile-info-field: $url\n$field_div");
            next FIELD;
        }
        my $value = $value_div->content;

        for my $kind (@account_kinds) {
            if ($field_div->matches(".uprofile-info-$kind")) {
                $translator->{accounts}->{$kind} = $value;
                next FIELD;
            }
        }

        my $label_div = $field_div->find('div.uprofile-label')->first
            || $field_div->find('div.h2title')->first;
        if (not $label_div) {
            $log->warn("no label element found for uprofile-info-field: $url\n$field_div");
            next FIELD;
        }

        ## /g so that every space is replaced, not just the first one
        my $attr = lc($label_div->text =~ s/ /_/gr);

        if ($attr eq 'badges') {
            ## Badges are stored as the list of image titles
            $value = [];
            for my $badge_img ($value_div->find('img')->each) {
                push @$value, $badge_img->attr('title');
            }
        }
        $translator->{$attr} = $value;
    }

    my $joined = $translator->{joined};
    if ($joined) {
        $translator->{joined} = $date_dot_parser->parse_datetime($joined);
    }

    my $role = $translator->{role};
    if ($role) {
        $translator->{role_value} = $translator_roles_value->{$role};
    }

    my $status_div = $dom->find('div.uprofile-status-icon')->first;
    $translator->{status} = $status_div ? $status_div->text : "";

    db_save('translator', $translator);
}
## Parse a song page, save it to the 'song' collection and, depending on
## $recur->{song}, queue its translations and its submitter's page.
##   $tx - finished transaction
sub parse_song_page {
    my ($tx) = @_;
    my $url  = $tx->req->url;
    my $page = get_page($url);
    my $dom  = $tx->res->dom;

    ## Artist name and page come from the first link in the artist item
    my $artist_li = $dom->find('li.song-node-info-artist')->first;
    unless ($artist_li) {
        $log->error("artist element 'li.song-node-info-artist' not found");
        return;
    }
    my $artist_a = $artist_li->find('a')->first;
    unless ($artist_a) {
        $log->error("artist page not found");
        return;
    }
    my $artist = {
        name => $artist_a->text,
        page => get_href_page($artist_a),
    };

    my $text_div = $dom->find('div.song-node-text')->first;
    unless ($text_div) {
        $log->error("element 'div.song-node-text' not found");
        return;
    }

    my $song = parse_song_trans_div($tx, $text_div);
    unless ($song) {
        $log->error("empty song");
        return;
    }
    $song->{_id}    = $page;
    $song->{artist} = $artist;

    ## Turn the links inside a list item into [{lang, page}, ...];
    ## a missing list item gives an empty list.
    my $links_of = sub {
        my ($list_li) = @_;
        return [] unless $list_li;
        return [
            map { +{ lang => $_->text, page => get_href_page($_) } }
                $list_li->find('a')->each
        ];
    };

    ## First matching item lists translations, second lists translation requests
    my $translate_items  = $dom->find('li.song-node-info-translate');
    my $translation_list = $links_of->($translate_items->[0]);
    my $request_list     = $links_of->($translate_items->[1]);
    $song->{translations}         = $translation_list;
    $song->{translation_requests} = $request_list;

    db_save('song', $song);

    if ($recur->{song}->{translation}) {
        add_page($_->{page}) for @$translation_list;
    }
    if ($recur->{song}->{submitter}) {
        add_page($song->{submit}->{user_page});
    }
}
## Parse a translation page, save it to the 'translation' collection and,
## if $recur->{translation}->{translator} is set, queue the translator page.
##   $tx - finished transaction
## Nothing is saved when the text block, its parsed content, or the link
## back to the original song is missing.
sub parse_translation_page {
    my $tx   = shift;
    my $url  = $tx->req->url;
    my $page = get_page($url);
    my $dom  = $tx->res->dom;

    my $div = $dom->find('div.translate-node-text')->first;
    if (not $div) {
        $log->error("element 'div.translate-node-text' not found");
        ## Stop here: parse_song_trans_div would call ->find on undef and die
        return;
    }

    my $translation = parse_song_trans_div($tx, $div);
    if (not $translation) {
        return;
    }

    ## The original song is linked from the info item containing "Song:"
    my $song_page = "";
    for my $song_title_li ($dom->find('li.song-node-info-album')->each) {
        next if $song_title_li !~ m/Song:/;
        my $song_title_a = $song_title_li->find('a')->first;
        next if not $song_title_a;
        $song_page = get_href_page($song_title_a);
    }
    if (not $song_page) {
        $log->error("parse_translation_page: song page not found, \$url=$url");
        return;
    }

    $translation->{_id}  = $page;
    $translation->{song} = $song_page;
    db_save('translation', $translation);

    if ($recur->{translation}->{translator}) {
        add_page($translation->{submit}->{user_page});
    }
}
## Extract who submitted (or last edited) an entry and when.
##   $div - a div.authorsubmitted element
## Returns a hash reference with:
##   user_id   - number taken from the first link's id ("user<N>");
##               0 for a guest, -1 when the div has no link at all
##   user_page - page path of the user, "" for a guest or missing link
##   username  - part of user_page after "/translator/", or "guest"
##   date      - result of $datetime_parser->parse_datetime on the text that
##               follows "... by ..., "
sub parse_submit_user_date {
    my $div = shift;

    my $user_id   = -1;
    my $user_page = "";
    my $username  = "";

    my $user_a = $div->find('a')->[0];
    if ($user_a) {
        ## A missing id attribute counts as id 0 (guest), without warnings
        my $id_text = ($user_a->attr('id') // '') =~ s|user||r;
        $user_id = ($id_text =~ m/\A\s*(-?\d+)/) ? $1 + 0 : 0;

        if ($user_id == 0) {
            $user_page = "";
            $username  = "guest";
        }
        else {
            $user_page = get_href_page($user_a);
            $username  = $user_page =~ s|.*/translator/||r;
        }
    }

    my $date = $datetime_parser->parse_datetime($div->text =~ s|.* by.*, ||r);

    return {
        user_id   => $user_id,
        user_page => $user_page,
        username  => $username,
        date      => $date,
    };
}
## Parse the text block shared by song pages and translation pages.
##   $tx  - finished transaction (used for the URL in log messages)
##   $div - the div.song-node-text / div.translate-node-text element
## Returns a hash reference with title, text, submit, last_edit, copyright
## and authorcomment, or nothing when the title or the submit info is missing.
sub parse_song_trans_div {
    my ($tx, $div) = @_;
    my $url = $tx->req->url;

    my $heading = $div->find('h2.title-h2')->first;
    unless ($heading) {
        $log->error("parse_song_trans_div: title h2 element not found: $url");
        return;
    }

    ## Use the heading's own text, falling back to its first <span>
    my $title = $heading->text;
    unless ($title) {
        my $span = $heading->find('span')->first;
        $title = $span->text if $span;
    }
    unless ($title) {
        $log->error("parse_song_trans_div: empty title: $url");
        return;
    }

    ## Body text: all following <p> siblings, then all following div.par siblings
    my @paragraphs = (
        (map { $_->content } $heading->following('p')->each),
        (map { $_->content } $heading->following('div.par')->each),
    );
    my $text = join("\n\n", @paragraphs) . "\n";

    ## First div.authorsubmitted is the submission, second (optional) the last edit
    my $submitted_divs = $div->find('div.authorsubmitted');
    my $submit_div     = $submitted_divs->first;
    unless ($submit_div) {
        $log->error("parse_song_trans_div: submit div not found: $url");
        return;    ## FIXME
    }
    my $submit = parse_submit_user_date($submit_div);

    my $last_edit;
    my $last_edit_div = $submitted_divs->[1];
    $last_edit = parse_submit_user_date($last_edit_div) if $last_edit_div;

    my $copyright_div = $div->find('div.copyrighttext')->first;
    my $copyright     = $copyright_div ? $copyright_div->content : "";

    my $authorcomment_div = $div->find('div.authorcomment')->first;
    my $authorcomment     = $authorcomment_div ? $authorcomment_div->content : "";

    return {
        title         => $title,
        text          => $text,
        submit        => $submit,
        last_edit     => $last_edit,
        copyright     => $copyright,
        authorcomment => $authorcomment,
    };
}
## Parse an artist page, queue its songs (if $recur->{artist}->{song}) and
## save the artist to the 'artist' collection.
##   $tx - finished transaction
## Nothing is saved when the header, title, breadcrumb, country or song list
## element is missing.
sub parse_artist_page {
    my $tx   = shift;
    my $url  = $tx->req->url;
    my $page = get_page($url);
    my $dom  = $tx->res->dom;

    my $header_div = $dom->find('div#content-header')->first;
    if (not $header_div) {
        $log->error("element 'div#content-header' not found");
        return;
    }
    my $h1 = $header_div->find('h1.title')->first;
    if (not $h1) {
        $log->error("parse_artist_page: h1.title element not found");
        return;
    }
    my $name = $h1->text =~ s/ lyrics$//r;

    ## Languages are the breadcrumb links whose page starts with "language/"
    my $lang_div = $dom->find('div.breadcrumb-node')->first;
    if (not $lang_div) {
        $log->error("element 'div.breadcrumb-node' not found");
        return;
    }
    my $lang_list = [];
    for my $lang_a ($lang_div->find('a')->each) {
        my $lang_page = get_href_page($lang_a);
        if ($lang_page !~ m|^language/.*|) {
            if ($lang_page ne $SITE) {
                $log->error("invalid language href: $lang_page, URL: $url");
            }
            next;
        }
        push @$lang_list, {
            name => $lang_a->text,
            page => $lang_page,
        };
    }

    my $orig_name    = '';
    my $orig_name_li = $dom->find('li.artist-node-info-name')->first;
    if ($orig_name_li) {
        $orig_name = $orig_name_li->text =~ s/Original name: //r;
    }

    my $country_li = $dom->find('li.artist-node-info-country')->first;
    if (not $country_li) {
        $log->error("parse_artist_page: country element not found: $url");
        return;
    }
    my $country_a = $country_li->find('a')->first;
    if (not $country_a) {
        $log->error("parse_artist_page: country page not found: $url");
        return;
    }
    my $country_name = $country_a->text;
    my $country_page = get_href_page($country_a);

    my $genre_list = [];
    my $genre_li   = $dom->find('li.artist-node-info-genre')->first;
    if ($genre_li) {
        for my $genre_a ($genre_li->find('a')->each) {
            push @$genre_list, {
                name => $genre_a->text,
                page => get_href_page($genre_a),
            };
        }
    }

    ## Attribute of the first $child_sel element inside the first
    ## $container_sel element, or "" when either element or the attribute
    ## is missing.
    my $first_attr = sub {
        my ($container_sel, $child_sel, $attr_name) = @_;
        my $container = $dom->find($container_sel)->first or return "";
        my $child     = $container->find($child_sel)->first or return "";
        return $child->attr($attr_name) // "";
    };
    my $site  = $first_attr->('li.artist-node-info-site', 'a',   'href');
    my $wiki  = $first_attr->('li.artist-node-info-wiki', 'a',   'href');
    my $image = $first_attr->('div.artist-node-img',      'img', 'src');

    my $songs_div = $dom->find('div#artistnodesonglist')->first;
    if (not $songs_div) {
        ## The message now names the selector that is actually searched for
        $log->error("element 'div#artistnodesonglist' not found: $url");
        return;
    }
    my $song_pages = artist_get_song_pages($songs_div);

    ## The two song lists following the main one: featured, then performed
    my $more_song_divs       = $songs_div->following('div.artist-node-songlist');
    my $feat_song_pages      = artist_get_song_pages($more_song_divs->[0]);
    my $performed_song_pages = artist_get_song_pages($more_song_divs->[1]);

    if ($recur->{artist}->{song}) {
        for my $song_page (@$song_pages, @$feat_song_pages, @$performed_song_pages) {
            add_page($song_page);
        }
    }
    if ($recur->{artist}->{translation}) {
        ## Not Implemented
    }

    db_save('artist', {
        _id       => $page,
        name      => $name,
        orig_name => $orig_name,
        lang_list => $lang_list,
        country   => {
            name => $country_name,
            page => $country_page,
        },
        genre_list      => $genre_list,
        site            => $site,    ## was parsed but never stored before
        wiki            => $wiki,
        image           => $image,
        songs           => $song_pages,
        feat_songs      => $feat_song_pages,
        performed_songs => $performed_song_pages,
    });
}
## Collect the song page paths listed inside a song-list element.
##   $parent_element - element containing td.songName cells (may be undef)
## Returns an array reference of page paths, one per cell that contains a
## link; an empty array reference when no element is given.
sub artist_get_song_pages {
    my $parent_element = shift;
    return [] unless $parent_element;

    my @pages;
    for my $cell ($parent_element->find('td.songName')->each) {
        my $link = $cell->find('a')->first;
        push @pages, get_href_page($link) if $link;
    }
    return \@pages;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment