Skip to content

Instantly share code, notes, and snippets.

@ilius
Last active January 1, 2020 05:30
Show Gist options
  • Save ilius/44a39c3c9686c7a1af3a to your computer and use it in GitHub Desktop.
Save ilius/44a39c3c9686c7a1af3a to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
=dependencies:
sudo cpan get Mojolicious
sudo cpan get MongoDB
sudo cpan get Set::Light
sudo cpan get Log::Log4perl
sudo cpan get DateTime::Format::Strptime
sudo cpan get File::HomeDir
sudo cpan get Digest::SHA1
=cut
package LTCrawler;
use 5.010;
use open qw(:locale);
use strict;
use utf8;
use warnings qw(all);
use Mojo::UserAgent;
use MongoDB;
use Set::Light;
use Log::Log4perl;
use DateTime::Format::Strptime;
use File::HomeDir;
use File::Path qw( make_path );
use Digest::SHA1 qw( sha1_hex );
use DateTime;
use Data::Dumper;
## Global output settings.
$, = " ";    # output field separator
$Data::Dumper::Terse    = 1;
$Data::Dumper::Indent   = 1;       # an indentation style (enum), not an indentation size
$Data::Dumper::Pair     = ': ';    # Python-like "key: value" pairs
$Data::Dumper::Sortkeys = 1;

###################### Configuration ##########################
my $log_level       = 'TRACE';    # one of TRACE, DEBUG, INFO, WARN, ERROR, FATAL
my $max_connections = 10;         # limit on parallel connections

## Recursion control: a true value at $recur->{page_type1}->{page_type2}
## means that links of type page_type2 found on a page_type1 page are
## queued for crawling as well.
my $recur = {
    artist => {
        song        => 1,    # working
        translation => 0,    # not implemented
    },
    translator => {
        topic         => 1,    # not implemented: songs, translations and requests added by user
        translation   => 0,    # not implemented: translations added by user
        request       => 0,    # not implemented: requests added by user
        transcription => 0,    # not implemented: transcriptions added by user
        artist        => 1,    # not implemented: artists added by user
        comment       => 1,    # not implemented: comments added by user
        vote          => 1,    # not implemented: votes added by user, tab "Voted"
    },
    song => {
        translation => 1,    # working
        submitter   => 1,    # working
    },
    translation => {
        translator => 1,     # working
    },
};

# FIFO queue of pages still to be fetched, seeded with the entry point(s).
# Other seeds that have been used:
#   http://lyricstranslate.com/en/fiorini-lando-lyrics.html
#   http://lyricstranslate.com/en/andrea-bocelli-lyrics.html
#   http://lyricstranslate.com/en/translator/evfokas
#   http://lyricstranslate.com/en/translator/aldefina
my @urls = (
    Mojo::URL->new('http://lyricstranslate.com/en/andrea-bocelli-lyrics.html'),
);

## Database connection settings.
my $db_name = 'LyricsTranslate';
my $db_host = 'localhost';
my $db_port = 27017;

## Site layout and local storage locations.
my $LANG      = 'en';
my $SITE      = 'http://lyricstranslate.com';
my $HOME      = File::HomeDir->my_home;
my $SAVE_DIR  = "$HOME/LyricsTranslate";
my $CACHE_DIR = "$SAVE_DIR/cache";

my $use_title_filename   = 1;
my $hash_dir_name_length = 2;    # leading SHA-1 hex digits used as cache sub-directory name

## Log4perl configuration text: one file appender and one screen appender.
my $log_conf = "
log4perl.category.LTCrawler = $log_level, Logfile, Screen
log4perl.appender.Logfile = Log::Log4perl::Appender::File
log4perl.appender.Logfile.filename = $SAVE_DIR/log
log4perl.appender.Logfile.layout = Log::Log4perl::Layout::PatternLayout
log4perl.appender.Logfile.layout.ConversionPattern = [\%r] \%F \%L \%m\%n
log4perl.appender.Screen = Log::Log4perl::Appender::Screen
log4perl.appender.Screen.stderr = 0
log4perl.appender.Screen.layout = Log::Log4perl::Layout::SimpleLayout
";
######################## Initialization ##########################
# Make sure the output directories exist before the logger is configured,
# because the log file lives inside $SAVE_DIR.
for my $dir ($SAVE_DIR, $CACHE_DIR) {
    next if -d $dir;
    make_path($dir) or die "Failed to create directory: $dir";
}

# The configuration text is handed to init() by reference.
Log::Log4perl::init(\$log_conf);
my $log = Log::Log4perl->get_logger("LTCrawler");

# Number of requests currently in flight.
my $active = 0;

# User agent that follows up to 5 redirects and picks up proxy settings.
my $ua = Mojo::UserAgent->new(max_redirects => 5);
$ua->proxy->detect;

# URLs whose fetch has completed (successfully or not).
my $done_urls = Set::Light->new();

# Parser for dates written as "dd.mm.yyyy".
my $date_dot_parser = DateTime::Format::Strptime->new(
    pattern   => '%d.%m.%Y',
    time_zone => 'UTC',
);

# Parser for timestamps written as "dd/mm/yyyy - HH:MM".
my $datetime_parser = DateTime::Format::Strptime->new(
    pattern   => '%d/%m/%Y - %H:%M',
    time_zone => 'UTC',
);

# MongoDB connection and one collection handle per document type.
my $client = MongoDB::MongoClient->new(host => $db_host, port => $db_port);
my $db     = $client->get_database($db_name);
my @col_names = qw(song translation artist page translator);
my $cols = { map { ( $_ => $db->get_collection($_) ) } @col_names };

# Ascending single-field indexes on the song collection ...
my @song_index_fields = qw(
    submit.date
    submit.user_id
    submit.user_page
    submit.username
    last_edit.date
    last_edit.user_id
    last_edit.user_page
    last_edit.username
    artist.page
    title
);
for my $field (@song_index_fields) {
    $cols->{song}->ensure_index({ $field => 1 });
}
# ... and on the page collection.
$cols->{page}->ensure_index({ 'type' => 1 });

# Translator role names in ascending rank; the rank is the list position.
my @roles_by_rank = (
    'Novice',
    'Junior Member',
    'Member',
    'Senior Member',
    'Super Member',
    'Editor',
    'Moderator',
    'Retired Moderator',
);
my $translator_roles_value = {
    map { ( $roles_by_rank[$_] => $_ ) } 0 .. $#roles_by_rank
};
####################### Event Loop ############################
# Every 10 ms, top the pool of in-flight requests up to $max_connections.
Mojo::IOLoop->recurring(
    0.01 => sub {
        while ($active < $max_connections) {
            my $url = shift @urls;
            unless ($url) {
                # Queue is empty: stop the loop once nothing is in flight,
                # otherwise wait for running requests to add more URLs.
                Mojo::IOLoop->stop if $active < 1;
                return;
            }
            # Passing a callback makes the request non-blocking; count it
            # as active until get_callback() has run.
            ++$active;
            $ua->get($url => \&get_callback);
        }
    }
);

# Start the event loop unless it is already running.
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
##################### Functions ##############################
# Log (at debug level) how many URLs are pending, done and in flight.
sub status {
    my $pending = scalar @urls;
    my $done    = $done_urls->size();
    $log->debug("$pending Pending URLs, $done Done URLs, $active Running");
}
# Queue a site page for crawling.
#
# Takes the page path without the language code (e.g. "translator/foo").
# Empty paths and login pages are ignored, as are pages that were already
# fetched or are already waiting in the queue.
sub add_page {
    my $page = shift;    # no language code

    # URLs that have been pushed onto @urls by this sub. $done_urls only
    # records finished requests, so without this a page linked from many
    # places would be queued (and fetched) once per link.
    state $queued = {};

    return if not $page;
    return if $page =~ m/.*user\/login.*/;

    my $url = "$SITE/$LANG/$page";
    $log->trace($url);
    return if $done_urls->has($url);
    return if $queued->{$url}++;

    push @urls, $url;
    status();
}
# Return the page path (no language code) that an element's href points to.
# The leading "/xx/" language prefix is stripped; links to the login page
# yield an empty string.
sub get_href_page {
    my $element = shift;
    my $href    = $element->attr('href');
    my $page    = $href =~ s|^/../||r;
    return "" if $page =~ m/.*user\/login.*/;
    return $page;
}
# Return the path of a URL object with the leading "/xx/" language prefix
# removed (no language code).
sub get_page {
    my $url  = shift;
    my $path = $url->path->to_string;
    return $path =~ s|^/../||r;
}
# Save a document into the named collection and record it in the "page"
# collection under the same _id, both stamped with the current time.
#
# Arguments: collection name, document hash reference (gets a crawl_time
# key added in place).
sub db_save {
    my ($col_name, $doc) = @_;

    my $now = DateTime->now;
    $doc->{crawl_time} = $now;
    $cols->{$col_name}->save($doc);

    my %page_entry = (
        _id        => $doc->{_id},
        type       => $col_name,
        crawl_time => $now,
    );
    $cols->{page}->save(\%page_entry);
}
# Completion callback for every request started by the event loop.
#
# Arguments: the user agent and the finished transaction.
# Marks the request URL as done, validates the response (non-empty, 2xx,
# text/html), then caches and parses the page. The connection slot is
# released on every path, including failures, so the event loop can keep
# scheduling new requests and can eventually stop.
sub get_callback {
    my ($ua2, $tx) = @_;

    if (not $tx) {
        $log->error("undefined \$tx");
        --$active;    # the slot was taken when the request was started
        return;
    }

    my $url = $tx->req->url;
    $done_urls->insert($url->to_string);

    # A response may carry no Content-Type header at all.
    my $content_type = $tx->res ? ($tx->res->headers->content_type // '') : '';

    if (not $tx->res) {
        $log->error("undefined \$tx->res");
    }
    elsif ($tx->res->is_empty) {
        $log->error("empty \$tx->res");
    }
    elsif (not $tx->res->is_status_class(200)) {
        $log->error("URL: $url , Response: " . $tx->res->to_string);
    }
    elsif ($content_type !~ m{^text/html\b}ix) {
        $log->error("Content Type: " . $content_type);
    }
    else {
        # An exception while caching or parsing must not skip the
        # bookkeeping below, otherwise $active never drops back.
        my $ok = eval {
            save_cache_page($tx);
            parse_page($tx);
            1;
        };
        if (not $ok) {
            $log->error("failed to process $url: $@");
        }
    }

    status();
    --$active;
}
# Write the raw response body of a transaction into the cache directory.
#
# The file name is the page path with slashes turned into underscores; it is
# placed in a sub-directory named after the first $hash_dir_name_length hex
# digits of the SHA-1 of the page path. Nothing is written for an empty
# page path. Failures are logged, not fatal.
sub save_cache_page {
    my $tx   = shift;
    my $page = get_page($tx->req->url);

    (my $fname = $page) =~ s/\//_/g;    # replace slashes with underlines
    return if $fname eq "";

    my $dirpath = "$CACHE_DIR/" . substr(sha1_hex($page), 0, $hash_dir_name_length);
    make_path($dirpath);
    my $fpath = "$dirpath/$fname";

    # Three-argument open keeps the file name from being interpreted as a
    # mode, and the explicit :raw layer stops the file-wide
    # "use open qw(:locale)" default from re-encoding the body bytes.
    my $fh;
    if (not open($fh, '>:raw', $fpath)) {
        $log->error("failed to save file $fpath: $!");
        return;
    }
    print {$fh} $tx->res->body;

    # Buffered write errors only surface when the handle is closed.
    if (not close $fh) {
        $log->error("failed to save file $fpath: $!");
        return;
    }
    $log->debug("saved file $fpath");
}
# Decide what kind of page a transaction holds and hand it to the matching
# parser. Translator pages are recognised by URL, request pages are skipped
# (not implemented), everything else by a marker element in the document.
sub parse_page {
    my $tx  = shift;
    my $url = $tx->req->url;
    my $dom = $tx->res->dom;

    return parse_translator_page($tx) if $url =~ m/.*\/translator\/.*/;
    return if $url =~ m/.*\/request\/.*/;    # Not Implemented

    # Marker selector => parser, tried in this order.
    my @parsers = (
        [ 'div.artist-node-info'    => \&parse_artist_page ],
        [ 'div.translate-node-text' => \&parse_translation_page ],
        [ 'div.song-node-text'      => \&parse_song_page ],
    );
    for my $entry (@parsers) {
        my ($selector, $parser) = @$entry;
        return $parser->($tx) if $dom->find($selector)->first;
    }
    return;
}
# Parse a translator (user profile) page and save it in the "translator"
# collection.
#
# Each 'div.uprofile-info-field' becomes one entry of the document:
#   - the "Send PM" field yields the numeric user_id from its link,
#   - messenger fields (icq, msn, google, skype, yahoo) go under {accounts},
#   - every other field is stored under its label, lower-cased with spaces
#     replaced by underscores; the "badges" field becomes a list of the
#     badge image titles.
# "joined" is converted to a DateTime and "role" additionally gets a numeric
# role_value.
sub parse_translator_page {
    my $tx   = shift;
    my $url  = $tx->req->url;
    my $page = get_page($url);
    my $dom  = $tx->res->dom;

    my $username   = $page =~ s/.*translator\///r;
    my $translator = {
        _id      => $page,
        username => $username,
        accounts => {},
    };

    # Messenger kinds; each has a field class ".uprofile-info-<kind>".
    my @account_kinds = qw(icq msn google skype yahoo);

    FIELD:
    for my $field_div ($dom->find('div.uprofile-info-field')->each) {
        if ($field_div =~ m/.*Send PM.*/) {
            my $pm_a = $field_div->find('a')->first;
            if ($pm_a) {
                # The user id is whatever follows ".../new/" in the link.
                $translator->{user_id} = ($pm_a->attr('href') =~ s|.*/new/||r) + 0;
            }
            next FIELD;
        }

        my $value_div = $field_div->find('div.uprofile-value')->first
            || $field_div->find('div.uprofile-value-inline')->first;
        if (not $value_div) {
            $log->warn("no value element found for uprofile-info-field: $url\n$field_div");
            next FIELD;
        }
        my $value = $value_div->content;

        for my $kind (@account_kinds) {
            if ($field_div->matches(".uprofile-info-$kind")) {
                $translator->{accounts}->{$kind} = $value;
                next FIELD;
            }
        }

        my $label_div = $field_div->find('div.uprofile-label')->first
            || $field_div->find('div.h2title')->first;
        if (not $label_div) {
            $log->warn("no label element found for uprofile-info-field: $url\n$field_div");
            next FIELD;
        }

        # Replace every space, not just the first one, so that labels of
        # three or more words do not end up as keys containing spaces.
        my $attr = lc($label_div->text =~ s/ /_/gr);
        if ($attr eq 'badges') {
            $value = [];
            for my $badge_img ($value_div->find('img')->each) {
                push @$value, $badge_img->attr('title');
            }
        }
        $translator->{$attr} = $value;
    }

    my $joined = $translator->{joined};
    if ($joined) {
        $translator->{joined} = $date_dot_parser->parse_datetime($joined);
    }

    my $role = $translator->{role};
    if ($role) {
        $translator->{role_value} = $translator_roles_value->{$role};
    }

    my $status_div = $dom->find('div.uprofile-status-icon')->first;
    $translator->{status} = $status_div ? $status_div->text : "";

    db_save('translator', $translator);
}
# Parse a song page, save it in the "song" collection and, depending on
# $recur->{song}, queue its translations and the submitter's profile page.
#
# The saved document is what parse_song_trans_div() returns, extended with
# _id, artist, translations and translation_requests.
sub parse_song_page {
    my $tx   = shift;
    my $url  = $tx->req->url;    # Mojo::URL; its ->path is a Mojo::Path
    my $page = get_page($url);
    my $dom  = $tx->res->dom;

    my $artist_li = $dom->find('li.song-node-info-artist')->first;
    unless ($artist_li) {
        $log->error("artist element 'li.song-node-info-artist' not found");
        return;
    }
    my $artist_link = $artist_li->find('a')->first;
    unless ($artist_link) {
        $log->error("artist page not found");
        return;
    }
    my %artist = (
        name => $artist_link->text,
        page => get_href_page($artist_link),
    );

    my $text_div = $dom->find('div.song-node-text')->first;
    unless ($text_div) {
        $log->error("element 'div.song-node-text' not found");
        return;
    }

    my $song = parse_song_trans_div($tx, $text_div);
    unless ($song) {
        $log->error("empty song");
        return;
    }
    $song->{_id}    = $page;
    $song->{artist} = \%artist;

    # Turn the links inside a list item into [{lang, page}, ...];
    # a missing list item gives an empty list.
    my $collect_links = sub {
        my $li = shift;
        my @links;
        if ($li) {
            for my $link ($li->find('a')->each) {
                push @links, {
                    lang => $link->text,
                    page => get_href_page($link),
                };
            }
        }
        return \@links;
    };

    # First matching item lists translations, the second one requests.
    my $translate_items = $dom->find('li.song-node-info-translate');
    my $translations    = $collect_links->($translate_items->[0]);
    my $requests        = $collect_links->($translate_items->[1]);
    $song->{translations}         = $translations;
    $song->{translation_requests} = $requests;

    db_save('song', $song);

    if ($recur->{song}->{translation}) {
        add_page($_->{page}) for @$translations;
    }
    if ($recur->{song}->{submitter}) {
        add_page($song->{submit}->{user_page});
    }
}
# Parse a translation page, save it in the "translation" collection and,
# if $recur->{translation}->{translator} is set, queue the translator's
# profile page.
#
# The saved document is what parse_song_trans_div() returns, extended with
# _id (this page) and song (page of the translated song). Nothing is saved
# when the text block or the link to the original song cannot be found.
sub parse_translation_page {
    my $tx   = shift;
    my $url  = $tx->req->url;    # Mojo::URL; its ->path is a Mojo::Path
    my $page = get_page($url);
    my $dom  = $tx->res->dom;

    my $div = $dom->find('div.translate-node-text')->first;
    if (not $div) {
        $log->error("element 'div.translate-node-text' not found");
        # Without the text block there is nothing to parse; going on would
        # call methods on an undefined value.
        return;
    }

    my $translation = parse_song_trans_div($tx, $div);
    if (not $translation) {
        return;
    }

    # The original song is linked from the info item labelled "Song:".
    my $song_page = "";
    for my $song_title_li ($dom->find('li.song-node-info-album')->each) {
        next if $song_title_li !~ m/Song:/;
        my $song_title_a = $song_title_li->find('a')->first;
        next if not $song_title_a;
        $song_page = get_href_page($song_title_a);
    }
    if (not $song_page) {
        $log->error("parse_translation_page: song page not found, \$url=$url");
        return;
    }

    $translation->{_id}  = $page;
    $translation->{song} = $song_page;
    db_save('translation', $translation);

    if ($recur->{translation}->{translator}) {
        add_page($translation->{submit}->{user_page});
    }
}
# Extract who submitted (or edited) something and when from an
# "authorsubmitted" element.
#
# Returns a hash reference with user_id, user_page, username and date.
# Without a user link user_id is -1 and the names are empty; a user id of 0
# is reported as username "guest" with no user page.
sub parse_submit_user_date {
    my $div = shift;

    my %info = (
        user_id   => -1,
        user_page => "",
        username  => "",
    );

    my $user_link = $div->find('a')->[0];
    if ($user_link) {
        # The element id has the form "user<number>".
        $info{user_id} = ($user_link->attr('id') =~ s|user||r) + 0;
        if ($info{user_id} == 0) {
            $info{username} = "guest";
        }
        else {
            $info{user_page} = get_href_page($user_link);
            $info{username}  = $info{user_page} =~ s|.*/translator/||r;
        }
    }

    # The timestamp is what remains after the "... by <name>, " prefix.
    my $date_text = $div->text =~ s|.* by.*, ||r;
    $info{date} = $datetime_parser->parse_datetime($date_text);

    return \%info;
}
# Parse the text block shared by song and translation pages.
#
# Arguments: the transaction (used for the URL in log messages) and the
# text block element.
# Returns a hash reference with title, text, submit, last_edit, copyright
# and authorcomment, or nothing when the title or the submit information
# is missing.
sub parse_song_trans_div {
    my ($tx, $div) = @_;
    my $url = $tx->req->url;

    my $title_el = $div->find('h2.title-h2')->first;
    unless ($title_el) {
        $log->error("parse_song_trans_div: title h2 element not found: $url");
        return;
    }

    # Use the heading's own text, falling back to its first <span>.
    my $title = $title_el->text;
    unless ($title) {
        my $title_span = $title_el->find('span')->first;
        $title = $title_span->text if $title_span;
    }
    unless ($title) {
        $log->error("parse_song_trans_div: empty title: $url");
        return;
    }

    # Body text: all following <p> elements, then all following div.par
    # elements, separated by blank lines.
    my @paragraphs;
    for my $selector ('p', 'div.par') {
        for my $el ($title_el->following($selector)->each) {
            push @paragraphs, $el->content;
        }
    }
    my $text = join("\n\n", @paragraphs) . "\n";

    # The first "authorsubmitted" element is the submission, the optional
    # second one the last edit.
    my $author_divs = $div->find('div.authorsubmitted');
    my $submit_div  = $author_divs->[0];
    unless ($submit_div) {
        $log->error("parse_song_trans_div: submit div not found: $url");
        return;    ## FIXME
    }
    my $submit = parse_submit_user_date($submit_div);

    my $last_edit;
    my $last_edit_div = $author_divs->[1];
    $last_edit = parse_submit_user_date($last_edit_div) if $last_edit_div;

    my $copyright_div = $div->find('div.copyrighttext')->first;
    my $copyright     = $copyright_div ? $copyright_div->content : "";

    my $authorcomment_div = $div->find('div.authorcomment')->first;
    my $authorcomment     = $authorcomment_div ? $authorcomment_div->content : "";

    my %parsed = (
        title         => $title,
        text          => $text,
        submit        => $submit,
        last_edit     => $last_edit,
        copyright     => $copyright,
        authorcomment => $authorcomment,
    );
    return \%parsed;
}
# Parse an artist page, queue its songs (if $recur->{artist}->{song} is set)
# and save the artist in the "artist" collection.
#
# The saved document holds name, orig_name, lang_list, country, genre_list,
# site, wiki, image and the three song lists (songs, feat_songs,
# performed_songs). Nothing is saved when the header, the language
# breadcrumb, the country or the song list cannot be found.
sub parse_artist_page {
    my $tx   = shift;
    my $url  = $tx->req->url;
    my $page = get_page($url);
    my $dom  = $tx->res->dom;

    my $header_div = $dom->find('div#content-header')->first;
    if (not $header_div) {
        $log->error("element 'div#content-header' not found");
        return;
    }
    my $h1 = $header_div->find('h1.title')->first;
    if (not $h1) {
        $log->error("parse_artist_page: h1.title element not found");
        return;
    }
    my $name = $h1->text =~ s/ lyrics$//r;

    my $lang_div = $dom->find('div.breadcrumb-node')->first;
    if (not $lang_div) {
        $log->error("element 'div.breadcrumb-node' not found");
        return;
    }
    my $lang_list = [];
    for my $lang_a ($lang_div->find('a')->each) {
        my $lang_page = get_href_page($lang_a);
        if ($lang_page !~ m|^language/.*|) {
            # A link back to the site itself is expected here; anything
            # else is reported.
            if ($lang_page ne $SITE) {
                $log->error("invalid language href: $lang_page, URL: $url");
            }
            next;
        }
        push @$lang_list, {
            name => $lang_a->text,
            page => $lang_page,
        };
    }

    my $orig_name    = '';
    my $orig_name_li = $dom->find('li.artist-node-info-name')->first;
    if ($orig_name_li) {
        $orig_name = $orig_name_li->text =~ s/Original name: //r;
    }

    my $country_li = $dom->find('li.artist-node-info-country')->first;
    if (not $country_li) {
        $log->error("parse_artist_page: country element not found: $url");
        return;
    }
    my $country_a = $country_li->find('a')->first;
    if (not $country_a) {
        $log->error("parse_artist_page: country page not found: $url");
        return;
    }
    my $country_name = $country_a->text;
    my $country_page = get_href_page($country_a);

    my $genre_list = [];
    my $genre_li   = $dom->find('li.artist-node-info-genre')->first;
    if ($genre_li) {
        for my $genre_a ($genre_li->find('a')->each) {
            push @$genre_list, {
                name => $genre_a->text,
                page => get_href_page($genre_a),
            };
        }
    }

    # Attribute of the first $child_selector element inside the first
    # $container_selector element, or "" when either element is missing.
    my $first_attr = sub {
        my ($container_selector, $child_selector, $attr_name) = @_;
        my $container = $dom->find($container_selector)->first or return "";
        my $child     = $container->find($child_selector)->first or return "";
        return $child->attr($attr_name);
    };
    my $site  = $first_attr->('li.artist-node-info-site', 'a',   'href');
    my $wiki  = $first_attr->('li.artist-node-info-wiki', 'a',   'href');
    my $image = $first_attr->('div.artist-node-img',      'img', 'src');

    my $songs_div = $dom->find('div#artistnodesonglist')->first;
    if (not $songs_div) {
        $log->error("element 'div#artistnodesonglist' not found: $url");
        return;
    }
    my $song_pages = artist_get_song_pages($songs_div);

    # The song lists following the main one: first the featuring list,
    # then the performed list. Missing lists give empty results.
    my $other_lists          = $songs_div->following('div.artist-node-songlist');
    my $feat_song_pages      = artist_get_song_pages($other_lists->[0]);
    my $performed_song_pages = artist_get_song_pages($other_lists->[1]);

    if ($recur->{artist}->{song}) {
        for my $song_page (@$song_pages, @$feat_song_pages, @$performed_song_pages) {
            add_page($song_page);
        }
    }
    if ($recur->{artist}->{translation}) {
        ## Not Implemented
    }

    db_save('artist', {
        _id       => $page,
        name      => $name,
        orig_name => $orig_name,
        lang_list => $lang_list,
        country   => {
            name => $country_name,
            page => $country_page,
        },
        genre_list      => $genre_list,
        site            => $site,
        wiki            => $wiki,
        image           => $image,
        songs           => $song_pages,
        feat_songs      => $feat_song_pages,
        performed_songs => $performed_song_pages,
    });
}
# Collect the song page paths linked from the 'td.songName' cells inside an
# element. Returns an array reference; it is empty when no element is given.
# Cells without a link are skipped.
sub artist_get_song_pages {
    my ($parent_element) = @_;
    return [] unless $parent_element;

    my @pages;
    for my $cell ($parent_element->find('td.songName')->each) {
        my $link = $cell->find('a')->first;
        push @pages, get_href_page($link) if $link;
    }
    return \@pages;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment