Skip to content

Instantly share code, notes, and snippets.

@kristiannissen
Created August 15, 2019 11:57
Show Gist options
  • Select an option

  • Save kristiannissen/cb897cb6d8dda26da3ca879a21474420 to your computer and use it in GitHub Desktop.

Select an option

Save kristiannissen/cb897cb6d8dda26da3ca879a21474420 to your computer and use it in GitHub Desktop.
Perl Crawler
#!/usr/local/bin/perl
use 5.10.0;
use strict;
use warnings;
use Data::Dumper;
use LWP::RobotUA;
use HTML::TreeBuilder;
use HTML::Element;
use URI;
use DBI;
# Db setup
my $db_file = "searchengine.db";
my $dbh = DBI->connect("dbi:SQLite:dname=$db_file", "", "", {RaiseError => 1}) or die $DBI::errstr;
# Pass domain from command line
my $domain = $ARGV[0];
# Create URL for testing
my $original_url = URI->new_abs("/", $domain);
# Create LWP::RobotUA using the UserAgent
my $robot_agent = new LWP::RobotUA('iAmARobot', '[email protected]');
$robot_agent->delay(0.5);
say "Crawling $domain";
my $request = $robot_agent->get($domain);
if ($request->is_success) {
my $html_tree = HTML::TreeBuilder->new()->parse_content($request->content);
for (@{ $html_tree->extract_links('a') }) {
my ($link, $element, $attr, $tag) = @$_;
my $url = URI->new_abs($link, $domain);
# say $url;
say $url if $url->host() eq $original_url->host();
$dbh->do("INSERT INTO urllist (url) VALUES (\"$url\")") if $url->host() eq $original_url->host();
}
# say $html_tree->as_trimmed_text();
} else {
say $request->status_line;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment