Skip to content

Instantly share code, notes, and snippets.

@joastbg
Created August 8, 2016 05:41
Show Gist options
  • Save joastbg/2e7aa881c64d5a913b31055d9d3d4f65 to your computer and use it in GitHub Desktop.
Save joastbg/2e7aa881c64d5a913b31055d9d3d4f65 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use strict;
use warnings;
use locale;
use utf8;
#use open qw(:std :utf8);
#use feature 'unicode_strings';
use Cassandra::Lite;
use Data::Dumper;
use String::Util qw(trim);
use Encode qw(decode);
use Encode::Detect::Detector;
use Time::HiRes qw(gettimeofday);
use HTML::TableExtract;
use Data::Dumper;
use Web::Scraper;
use File::Slurp;
use HTML::TreeBuilder;
use HTML::TreeBuilder::XPath;
use Text::Table;
# The terminating semicolon matters: without it the binmode call below is
# parsed as part of this use statement's import list.
use JSON::XS;

# Output setup for STDOUT. The subs in this file hand-encode their output
# (utf8::encode in extractlogos, JSON::XS->new->utf8 in extractone) before
# printing it.
# NOTE(review): ':raw' following ':utf8' presumably switches the handle back
# to plain byte output, which is what that hand-encoding expects - confirm
# against the PerlIO layer documentation before changing the layer string.
#binmode STDOUT, ":utf8"; # assuming your terminal is UTF-8
binmode STDOUT, ':utf8:raw';
# Substitute every occurrence of a pattern in a string, ignoring case.
#
#   $from   - pattern to look for; it is interpolated into the regex as-is,
#             so regex metacharacters keep their special meaning
#   $to     - replacement text
#   $string - text to work on (the caller's value is not modified)
#
# Returns the modified copy of $string.
sub replace {
    my $pattern     = shift;
    my $replacement = shift;
    my $text        = shift;

    # /g: all occurrences, /i: case-insensitive
    $text =~ s/$pattern/$replacement/gi;

    return $text;
}
# Print the name and image URL of every logo found in one HTML file.
#
# For each <img> that is a direct child of a <div class="logo">, one line is
# written to STDOUT: the image's alt text, a tab, the image's src attribute.
#
#   $file    - path of the HTML file to read; its content is decoded as UTF-8
#   $counter - accepted so the call signature stays unchanged; not used here
#
# Error handling for an unreadable file is whatever read_file does by default.
sub extractlogos {
    my ($file, $counter) = @_;

    my $html     = read_file($file);
    my $html_raw = Encode::decode("UTF-8", $html);
    my $tree     = HTML::TreeBuilder::XPath->new_from_content($html_raw);

    foreach my $logo_node ($tree->findnodes('//div[ @class = "logo" ]/img')) {
        my $name = $logo_node->attr('alt');
        my $src  = $logo_node->attr('src');

        # An <img> without alt or src still produces a line, with the
        # missing field left empty, instead of an "uninitialized" warning.
        $name = '' unless defined $name;
        $src  = '' unless defined $src;

        # Both fields are turned into UTF-8 bytes by hand before printing,
        # so a non-ASCII URL is treated exactly like a non-ASCII name.
        utf8::encode($name);
        utf8::encode($src);

        print $name . "\t" . $src . "\n";
    }

    # Release the parse tree explicitly; this sub runs once per input file.
    $tree->delete;

    return;
}
# Turn the text of one company listing into a hash of its fields.
#
#   $text - flattened text of a listing element
#
# Returns a hash reference with the keys company, orgnr, street, zip, city
# and phone, plus fax and url when the trailing text contains a "Fax:" part.
# Returns nothing when $text does not have the expected layout.
sub _parse_company_entry {
    my ($text) = @_;

    # NOTE(review): the '.' in 'Org.nummer' is an unescaped regex dot and so
    # matches any character - confirm whether that is intended.
    my ($company, $orgnr, $street, $zip, $city, $phone, $rest) =
        $text =~ /([\w\sÅÄÖåäö]*)Org.nummer: ([\d]{6}-[\d]{4})([^,]+), ([\d]{3} [\d]{2}) ([\w\sÅÄÖåäö]*)Tel: ([\d ]+-[\d ]+)([\.\|\w\s\d:-]*)/
        or return;

    my %entry;
    $entry{company} = trim($company);
    $entry{orgnr}   = trim($orgnr);
    $entry{street}  = trim($street);
    $entry{zip}     = trim($zip);
    $entry{city}    = trim($city);
    $entry{phone}   = trim($phone);

    # Whatever follows the phone number may hold a fax number, itself
    # followed by the home page introduced by the label "Hemsida: ".
    if (my ($fax, $tail) = $rest =~ /Fax: ([\d ]+-[\d ]+)([\.\|\w\s\d:-]*)/) {
        my $homepage = replace("Hemsida: ", "", $tail);
        $entry{fax} = trim($fax);
        $entry{url} = trim($homepage);
    }

    return \%entry;
}

# Extract all company listings from one HTML file and save them as JSON.
#
# The listings are the elements selected by the XPath '//div[2]/div[4]/div'.
# Every listing whose text has the expected layout becomes one entry of the
# "companies" array. The resulting JSON document is printed to STDOUT,
# framed by separator lines, and written to "companies<counter>.json" in the
# current directory.
#
#   $file    - path of the HTML file to read; its content is decoded as UTF-8
#   $counter - number used to build the name of the output file
#
# Dies if the output file cannot be opened or closed.
sub extractone {
    my ($file, $counter) = @_;

    print "------------------------------------------\n";
    print "Processing: " . $file . "\n";

    my $html     = read_file($file);
    my $html_raw = Encode::decode("UTF-8", $html);

    # Same tree class as extractlogos, the one that documents findnodes.
    my $tree = HTML::TreeBuilder::XPath->new_from_content($html_raw);

    my @companies;
    for my $el ($tree->findnodes('//div[2]/div[4]/div')) {
        my $entry = _parse_company_entry($el->as_text());
        push @companies, $entry if $entry;
    }

    # All needed text has been copied out; release the parse tree.
    $tree->delete;

    # ->utf8 makes encode() return UTF-8 encoded bytes, so the same value is
    # used for both the console and the file without any further encoding.
    my $coder      = JSON::XS->new->utf8->pretty->allow_nonref;
    my $json_bytes = $coder->encode({ companies => \@companies });
    print $json_bytes;

    my $out_file = "companies" . $counter . ".json";
    open(my $out, ">", $out_file)
        or die "Can't open $out_file for writing: $!";
    print {$out} $json_bytes;
    # Buffered write errors only surface at close, so check it as well.
    close($out) or die "Can't close $out_file: $!";

    print "------------------------------------------\n";

    return;
}
# Run extractlogos on every HTML file in the current directory.
#
# A directory entry is processed when its name contains "html" anywhere
# (the pattern is deliberately kept as loose as it was) and it is a plain
# file. Entries are handled in sorted name order, and each one is passed a
# running number starting at 1.
#
# Dies if the directory cannot be opened.
sub parseall {
    my $directory = '.';

    opendir(my $dh, $directory)
        or die "Can't open directory $directory: $!";
    # Read the whole listing at once: no entry name can end the loop early,
    # and sorting makes the processing order (and numbering) repeatable.
    my @names = sort grep { /html/ } readdir($dh);
    closedir($dh);

    my $counter = 0;
    for my $name (@names) {
        my $path = $directory . '/' . $name;

        # Skip directories and other non-files whose name happens to match.
        next unless -f $path;

        $counter++;
        extractlogos($path, $counter);
    }

    return;
}
# Script entry point: run the logo extraction over the current directory.
parseall();
# Alternative for debugging: process a single saved page.
#extractlogos("1470451018330.html", 1);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment