joastbg · August 8, 2016 05:41
diff --git a/process.pl b/process.pl
 #!/usr/bin/perl

 use strict;
 use warnings;
 use locale;
 use utf8;
 #use open qw(:std :utf8);
 #use feature 'unicode_strings';

 use Cassandra::Lite;
 use Data::Dumper;
 use String::Util qw(trim);
 use Encode qw(decode);
 use Encode::Detect::Detector;
 use Time::HiRes qw(gettimeofday);
 use HTML::TableExtract;
 use Data::Dumper;
 use Web::Scraper;
 use File::Slurp;
 use HTML::TreeBuilder;
 use HTML::TreeBuilder::XPath;
 use Text::Table;
 use JSON::XS;

 # Configure the STDOUT layers once at start-up.
 # The statement above needs its terminating semicolon: without it this
 # binmode call is parsed as the import list of `use JSON::XS`.
 # NOTE(review): per the PerlIO documentation, pushing :raw after :utf8
 # appears to undo the :utf8 layer, leaving STDOUT a byte stream -- confirm
 # that this is the intent.
 #binmode STDOUT, ":utf8";  # assuming your terminal is UTF-8
 binmode STDOUT, ':utf8:raw';

 # replace($from, $to, $string)
 #
 # Returns a copy of $string in which every case-insensitive match of the
 # pattern $from has been replaced by $to. $from is interpolated into the
 # regex unescaped, so regex metacharacters in it stay active. The caller's
 # string is left untouched.
 sub replace {
     my $pattern     = shift;
     my $replacement = shift;
     my $subject     = shift;

     # Substitute on a copy: all occurrences (g), ignoring case (i).
     (my $result = $subject) =~ s/$pattern/$replacement/gi;

     return $result;
 }

 # extractlogos($file, $counter)
 #
 # Reads the HTML file $file, decodes it as UTF-8 and prints one line to
 # STDOUT for every <img> that is a direct child of a <div class="logo">:
 # the image's alt text, a tab, the image's src, and a newline. Both values
 # are UTF-8 encoded before printing; a missing attribute is printed as an
 # empty string.
 #
 # $counter is accepted so existing callers keep working, but it is not used.
 # Returns nothing.
 sub extractlogos {

     my ($file, $counter) = @_;

     my $html     = read_file( $file );
     my $html_raw = Encode::decode("UTF-8", $html);

     my $tree = HTML::TreeBuilder::XPath->new_from_content($html_raw);

     foreach my $logo_node ( $tree->findnodes('//div[ @class = "logo" ]/img') ) {
         my $name = $logo_node->attr('alt');
         my $src  = $logo_node->attr('src');

         # An <img> may lack either attribute; fall back to '' so neither
         # utf8::encode nor print is handed undef.
         $name = '' unless defined $name;
         $src  = '' unless defined $src;

         # Encode both fields the same way so a non-ASCII src is emitted as
         # UTF-8 bytes just like the alt text.
         utf8::encode($name);
         utf8::encode($src);

         print $name . "\t" . $src . "\n";
     }

     # Release the parse tree explicitly; this sub is called once per file.
     $tree->delete;

     return;
 }

 # extractone($file, $counter)
 #
 # Reads the HTML file $file, decodes it as UTF-8 and looks at the text of
 # every node matched by the XPath '//div[2]/div[4]/div'. Each text that
 # matches the company pattern (name, "Org.nummer:", street, zip, city,
 # "Tel:" and an optional "Fax:" / "Hemsida:" tail) becomes a hash with the
 # keys company, orgnr, street, zip, city, phone and, when a fax is present,
 # fax and url.
 #
 # The collected list is serialised as pretty-printed, UTF-8 encoded JSON of
 # the form { "companies": [ ... ] }, printed to STDOUT and written to
 # "companies<counter>.json" in the current directory.
 #
 # Dies if the output file cannot be opened or closed. Returns nothing.
 sub extractone {

     my ($file, $counter) = @_;

     print "------------------------------------------\n";
     print "Processing: " . $file . "\n";

     my $html     = read_file( $file );
     my $html_raw = Encode::decode("UTF-8", $html);
     my $tree     = HTML::TreeBuilder->new_from_content($html_raw);

     my @toc = $tree->findnodes('//div[2]/div[4]/div');

     my @companies = ();

     for my $el ( @toc ) {

         my $text = $el->as_text();

         # Copy the captures into named lexicals right away instead of
         # relying on $1..$7 staying intact across later statements.
         my ($company, $orgnr, $street, $zip, $city, $phone, $rest) =
             $text =~ /([\w\sÅÄÖåäö]*)Org.nummer: ([\d]{6}-[\d]{4})([^,]+), ([\d]{3} [\d]{2}) ([\w\sÅÄÖåäö]*)Tel: ([\d ]+-[\d ]+)([\.\|\w\s\d:-]*)/
             or next;

         my %hash = ();

         # One assignment per key keeps every trim() call in scalar context.
         $hash{ company } = trim($company);
         $hash{ orgnr }   = trim($orgnr);
         $hash{ street }  = trim($street);
         $hash{ zip }     = trim($zip);
         $hash{ city }    = trim($city);
         $hash{ phone }   = trim($phone);

         # The text after the phone number may carry a fax number followed
         # by a "Hemsida: " prefixed address, which is stored as url.
         if ($rest =~ /Fax: ([\d ]+-[\d ]+)([\.\|\w\s\d:-]*)/) {
             my ($fax, $tail) = ($1, $2);

             $hash{ fax } = trim($fax);
             $hash{ url } = trim(replace("Hemsida: ", "", $tail));
         }

         push @companies, \%hash;
     }

     # All needed values are plain strings now; release the parse tree.
     $tree->delete;

     my %companies_dict = ( companies => \@companies );

     # ->utf8 makes encode() return UTF-8 encoded bytes, so the result can
     # be written to byte-oriented handles as-is.
     my $coder      = JSON::XS->new->utf8->pretty->allow_nonref;
     my $json_bytes = $coder->encode(\%companies_dict);

     print $json_bytes;

     my $out_file = "companies" . $counter . ".json";

     open (my $out, ">", $out_file) or die "Can't open $out_file for writing: $!";
     print $out $json_bytes;
     close ($out) or die "Can't close $out_file: $!";

     print "------------------------------------------\n";

     return;
 }

 # parseall()
 #
 # Runs extractlogos() on every regular file in the current directory whose
 # name contains "html". Files are processed in sorted name order and each
 # call receives a running 1-based counter as its second argument.
 #
 # Dies if the directory cannot be opened. Returns nothing.
 sub parseall {

     my $directory = '.';
     my $counter = 0;

     opendir (my $dh, $directory) or die "Can't open directory $directory: $!";

     # readdir returns entries in no particular order; sort them so the
     # counter assigned to each file is reproducible. The -f test skips
     # directories and other non-files that merely have "html" in the name.
     my @files = sort grep { /html/ && -f "$directory/$_" } readdir($dh);

     closedir ($dh);

     foreach my $file (@files) {
         $counter++;
         extractlogos($directory . '/' . $file, $counter);
     }

     return;
 }

 # Script entry point: process every matching file in the current directory.
 parseall();
 # Single-file invocation kept for manual debugging:
 #extractlogos("1470451018330.html", 1);
	#!/usr/bin/perl

	use strict;
	use warnings;
	use locale;
	use utf8;
	#use open qw(:std :utf8);
	#use feature 'unicode_strings';

	use Cassandra::Lite;
	use Data::Dumper;
	use String::Util qw(trim);
	use Encode qw(decode);
	use Encode::Detect::Detector;
	use Time::HiRes qw(gettimeofday);
	use HTML::TableExtract;
	use Data::Dumper;
	use Web::Scraper;
	use File::Slurp;
	use HTML::TreeBuilder;
	use HTML::TreeBuilder::XPath;
	use Text::Table;
	use JSON::XS;

	# Configure the STDOUT layers once at start-up.
	# The statement above needs its terminating semicolon: without it this
	# binmode call is parsed as the import list of `use JSON::XS`.
	# NOTE(review): per the PerlIO documentation, pushing :raw after :utf8
	# appears to undo the :utf8 layer, leaving STDOUT a byte stream -- confirm
	# that this is the intent.
	#binmode STDOUT, ":utf8"; # assuming your terminal is UTF-8
	binmode STDOUT, ':utf8:raw';

	# replace($from, $to, $string)
	#
	# Case-insensitively replaces every match of the pattern $from in $string
	# with $to and returns the resulting copy. $from is used as a regex without
	# escaping. The argument passed by the caller is not changed.
	sub replace {
	    my $pattern     = shift;
	    my $replacement = shift;
	    my $subject     = shift;

	    # Work on a copy; flags: g = all occurrences, i = ignore case.
	    (my $result = $subject) =~ s/$pattern/$replacement/gi;

	    return $result;
	}

	# extractlogos($file, $counter)
	#
	# Reads the HTML file $file, decodes it as UTF-8 and prints one line to
	# STDOUT for every <img> that is a direct child of a <div class="logo">:
	# the image's alt text, a tab, the image's src, and a newline. Both values
	# are UTF-8 encoded before printing; a missing attribute is printed as an
	# empty string.
	#
	# $counter is accepted so existing callers keep working, but it is not used.
	# Returns nothing.
	sub extractlogos {

	    my ($file, $counter) = @_;

	    my $html     = read_file( $file );
	    my $html_raw = Encode::decode("UTF-8", $html);

	    my $tree = HTML::TreeBuilder::XPath->new_from_content($html_raw);

	    foreach my $logo_node ( $tree->findnodes('//div[ @class = "logo" ]/img') ) {
	        my $name = $logo_node->attr('alt');
	        my $src  = $logo_node->attr('src');

	        # An <img> may lack either attribute; fall back to '' so neither
	        # utf8::encode nor print is handed undef.
	        $name = '' unless defined $name;
	        $src  = '' unless defined $src;

	        # Encode both fields the same way so a non-ASCII src is emitted as
	        # UTF-8 bytes just like the alt text.
	        utf8::encode($name);
	        utf8::encode($src);

	        print $name . "\t" . $src . "\n";
	    }

	    # Release the parse tree explicitly; this sub is called once per file.
	    $tree->delete;

	    return;
	}

	# extractone($file, $counter)
	#
	# Reads the HTML file $file, decodes it as UTF-8 and looks at the text of
	# every node matched by the XPath '//div[2]/div[4]/div'. Each text that
	# matches the company pattern (name, "Org.nummer:", street, zip, city,
	# "Tel:" and an optional "Fax:" / "Hemsida:" tail) becomes a hash with the
	# keys company, orgnr, street, zip, city, phone and, when a fax is present,
	# fax and url.
	#
	# The collected list is serialised as pretty-printed, UTF-8 encoded JSON of
	# the form { "companies": [ ... ] }, printed to STDOUT and written to
	# "companies<counter>.json" in the current directory.
	#
	# Dies if the output file cannot be opened or closed. Returns nothing.
	sub extractone {

	    my ($file, $counter) = @_;

	    print "------------------------------------------\n";
	    print "Processing: " . $file . "\n";

	    my $html     = read_file( $file );
	    my $html_raw = Encode::decode("UTF-8", $html);
	    my $tree     = HTML::TreeBuilder->new_from_content($html_raw);

	    my @toc = $tree->findnodes('//div[2]/div[4]/div');

	    my @companies = ();

	    for my $el ( @toc ) {

	        my $text = $el->as_text();

	        # The company and city groups carry a `*` quantifier so that whole
	        # multi-character names are captured, not just a single character.
	        # Captures are copied into named lexicals right away instead of
	        # relying on $1..$7 staying intact across later statements.
	        my ($company, $orgnr, $street, $zip, $city, $phone, $rest) =
	            $text =~ /([\w\sÅÄÖåäö]*)Org.nummer: ([\d]{6}-[\d]{4})([^,]+), ([\d]{3} [\d]{2}) ([\w\sÅÄÖåäö]*)Tel: ([\d ]+-[\d ]+)([\.\|\w\s\d:-]*)/
	            or next;

	        my %hash = ();

	        # One assignment per key keeps every trim() call in scalar context.
	        $hash{ company } = trim($company);
	        $hash{ orgnr }   = trim($orgnr);
	        $hash{ street }  = trim($street);
	        $hash{ zip }     = trim($zip);
	        $hash{ city }    = trim($city);
	        $hash{ phone }   = trim($phone);

	        # The text after the phone number may carry a fax number followed
	        # by a "Hemsida: " prefixed address, which is stored as url.
	        if ($rest =~ /Fax: ([\d ]+-[\d ]+)([\.\|\w\s\d:-]*)/) {
	            my ($fax, $tail) = ($1, $2);

	            $hash{ fax } = trim($fax);
	            $hash{ url } = trim(replace("Hemsida: ", "", $tail));
	        }

	        push @companies, \%hash;
	    }

	    # All needed values are plain strings now; release the parse tree.
	    $tree->delete;

	    my %companies_dict = ( companies => \@companies );

	    # ->utf8 makes encode() return UTF-8 encoded bytes, so the result can
	    # be written to byte-oriented handles as-is.
	    my $coder      = JSON::XS->new->utf8->pretty->allow_nonref;
	    my $json_bytes = $coder->encode(\%companies_dict);

	    print $json_bytes;

	    my $out_file = "companies" . $counter . ".json";

	    open (my $out, ">", $out_file) or die "Can't open $out_file for writing: $!";
	    print $out $json_bytes;
	    close ($out) or die "Can't close $out_file: $!";

	    print "------------------------------------------\n";

	    return;
	}

	# parseall()
	#
	# Runs extractlogos() on every regular file in the current directory whose
	# name contains "html". Files are processed in sorted name order and each
	# call receives a running 1-based counter as its second argument.
	#
	# Dies if the directory cannot be opened. Returns nothing.
	sub parseall {

	    my $directory = '.';
	    my $counter = 0;

	    opendir (my $dh, $directory) or die "Can't open directory $directory: $!";

	    # readdir returns entries in no particular order; sort them so the
	    # counter assigned to each file is reproducible. The -f test skips
	    # directories and other non-files that merely have "html" in the name.
	    my @files = sort grep { /html/ && -f "$directory/$_" } readdir($dh);

	    closedir ($dh);

	    foreach my $file (@files) {
	        $counter++;
	        extractlogos($directory . '/' . $file, $counter);
	    }

	    return;
	}

	# Script entry point: process every matching file in the current directory.
	parseall();
	# Single-file invocation kept for manual debugging:
	#extractlogos("1470451018330.html", 1);