gonter · May 14, 2016 02:37
diff --git a/.gitignore b/.gitignore
 *.csv
 *.sql
diff --git a/README.textile b/README.textile
diff --git a/pga.pl b/pga.pl
 #!/usr/bin/perl

 use strict;

 use Data::Dumper;
 $Data::Dumper::Indent= 1;

 use Util::Matrix;

 my $fnm1= shift (@ARGV);
 my $fnm2= shift (@ARGV);

 my $tbl1= pga::read_table ($fnm1);
 $tbl1->mk_idx(0);
 my $tbl2= pga::read_table ($fnm2);
 $tbl2->mk_idx(0);

 my $tbl_x= pga::read_table ('sj_dump.csv');
 my $tbl_y= $tbl_x->filter(2, 1251);
 my $idx_y= $tbl_y->mk_idx(0);

 my $missing= pga::diff ($tbl1, $tbl2);
 my @column_names= @{$tbl1->{'columns'}};

 # print "columns: ", Dumper (\@column_names);
 # print "missing: ", Dumper ($missing);

 push (@column_names, 'hit');
 my $cn_hit= $#column_names;
 my @out_rows;
 foreach my $row (@$missing)
 {
  # print "row: ", Dumper ($row);
  $row->[$cn_hit]= '';
  if (exists ($idx_y->{$row->[4]}))
  {
    $row->[$cn_hit]= 'name';
    push (@out_rows, $row);
  }
 }

 # Util::Matrix::print(\@column_names, $missing);
 Util::Matrix::print(\@column_names, \@out_rows);

 exit (0);

 sub x1
 {
 my $fnm1= 'sj_current.csv';
 my $fnm2= 'sj_dump.csv';

 my $tbl1= pga::read_table ($fnm1);
 $tbl1->mk_idx(0);
 # print "tbl1: ", Dumper ($tbl1);

 my $tbl2= pga::read_table ($fnm2);
 $tbl2->mk_idx(0);

 my $missing= pga::diff ($tbl1, $tbl2);
 # print "columns: ", Dumper ($tbl1->{'columns'});
 # print "missing: ", Dumper ($missing);
 check_1251 ($missing);

 write_inserts ('SchlagwortJelinek', $tbl1->{'columns'}, $missing);
 }

 sub write_inserts
 {
  my $tbl_name= shift;
  my $col_names= shift;
  my $rows= shift;

  my $fnm_sql= 'insert_missing.sql';
  open (SQL, '>:utf8', $fnm_sql) or die;

  my @col_names= @$col_names;
  my $col_count= @col_names;
  my $cnt= 0;
  foreach my $row (@$rows)
  {

    my @col;
    my @val;
    for (my $i= 0; $i < $col_count; $i++)
    {
      if (defined ($row->[$i]) && $row->[$i] ne '')
      {
        push (@col, $col_names[$i]); 
        push (@val, $row->[$i]);
      }
    }

    print SQL "INSERT INTO \"$tbl_name\" (",
           join (',', map { $_ } @col),
        ") VALUES (", 
          join (',', map { "'".$_."'" } @val),");\n";
    $cnt++;
  }
  close (SQL);
  print "written $cnt rows to $fnm_sql\n";
  $cnt;
 }

 =head2 check_1251

 find out if either column 0 or column 2 have the value 1251;
 complain otherwise

 =cut

 sub check_1251
 {
  my $rows= shift;
  foreach my $row (@$rows)
  {
    next if ($row->[0] == 1251 || $row->[2] == 1251);
    print "check row: ", Dumper($row);
  }
 }

 package pga;
 # postgres aligned output

 sub mk_idx
 {
  my $tbl= shift;
  my $col= shift;

  my $idx;
  foreach my $row (@{$tbl->{rows}})
  {
    # print "row: ", main::Dumper ($row);
    my $v= $row->[$col];
    push (@{$idx->{$v}}, $row);
  }

  $tbl->{index}= $idx;
 }

 =head1 filter($table, $column_number, $value)

 return a new table consisting only of those rows from $table that contain the given $value in given $column_number.

 =cut

 sub filter
 {
  my $tbl= shift;
  my $col_num= shift;
  my $val= shift;

  my @out_rows;
  foreach my $row (@{$tbl->{rows}})
  {
    push (@out_rows, $row) if ($row->[$col_num] == $val);
  }

  my $res= { columns => $tbl->{columns}, rows => \@out_rows };
  bless ($res);

  $res;
 }

 =head2 diff ($tbl1, $tbl2)

 look for items from $tbl2 which are not present in $tbl1

 =cut

 sub diff
 {
  my $tbl1= shift;
  my $tbl2= shift;

  my $idx1= $tbl1->{index};
  my $idx2= $tbl2->{index};

  my @missing;
  ROW2: foreach my $id (sort { $a <=> $b } keys %$idx2)
  {
    if (exists ($idx1->{$id}))
    {
      # TODO: check if both rows are identical or report otherwise;
      next ROW2;
    }
    push (@missing, @{$idx2->{$id}});
  }
  (wantarray) ? @missing : \@missing;
 }

 sub read_table
 {
  my $fnm= shift;

  unless (open (FI, '<:utf8', $fnm))
  {
    print "cant read $fnm\n";
    return undef;
  }

  my @lines= <FI>;
  close (FI);

  # TODO: check for empty files or other stuff...
  chop (@lines);
  my $columns= shift (@lines);
  my $rows= pop (@lines);
  my @columns= split(/\|/, $columns);
  # print "columns: ", main::Dumper(\@columns);
  print "rows=[$rows]\n"; # TODO: check if $rows is 
  push (@lines, $rows) unless ($rows =~ m#^\(\d+ rows\)$#);

  my @rows;
  my $res=
  {
    'columns' => \@columns,
    'rows' => \@rows,
  };
  bless ($res);

  foreach my $line (@lines)
  {
    my @l= split (/\|/, $line);
    push (@rows, \@l);
  }

  $res;
 }
	#!/usr/bin/perl

	use strict;

	use Data::Dumper;
	$Data::Dumper::Indent= 1;

	use Util::Matrix;

	my $fnm1= shift (@ARGV);
	my $fnm2= shift (@ARGV);

	my $tbl1= pga::read_table ($fnm1);
	$tbl1->mk_idx(0);
	my $tbl2= pga::read_table ($fnm2);
	$tbl2->mk_idx(0);

	my $tbl_x= pga::read_table ('sj_dump.csv');
	my $tbl_y= $tbl_x->filter(2, 1251);
	my $idx_y= $tbl_y->mk_idx(0);

	my $missing= pga::diff ($tbl1, $tbl2);
	my @column_names= @{$tbl1->{'columns'}};

	# print "columns: ", Dumper (\@column_names);
	# print "missing: ", Dumper ($missing);

	push (@column_names, 'hit');
	my $cn_hit= $#column_names;
	my @out_rows;
	foreach my $row (@$missing)
	{
	# print "row: ", Dumper ($row);
	$row->[$cn_hit]= '';
	if (exists ($idx_y->{$row->[4]}))
	{
	$row->[$cn_hit]= 'name';
	push (@out_rows, $row);
	}
	}

	# Util::Matrix::print(\@column_names, $missing);
	Util::Matrix::print(\@column_names, \@out_rows);

	exit (0);

	sub x1
	{
	my $fnm1= 'sj_current.csv';
	my $fnm2= 'sj_dump.csv';

	my $tbl1= pga::read_table ($fnm1);
	$tbl1->mk_idx(0);
	# print "tbl1: ", Dumper ($tbl1);

	my $tbl2= pga::read_table ($fnm2);
	$tbl2->mk_idx(0);

	my $missing= pga::diff ($tbl1, $tbl2);
	# print "columns: ", Dumper ($tbl1->{'columns'});
	# print "missing: ", Dumper ($missing);
	check_1251 ($missing);

	write_inserts ('SchlagwortJelinek', $tbl1->{'columns'}, $missing);
	}

	sub write_inserts
	{
	my $tbl_name= shift;
	my $col_names= shift;
	my $rows= shift;

	my $fnm_sql= 'insert_missing.sql';
	open (SQL, '>:utf8', $fnm_sql) or die;

	my @col_names= @$col_names;
	my $col_count= @col_names;
	my $cnt= 0;
	foreach my $row (@$rows)
	{

	my @col;
	my @val;
	for (my $i= 0; $i < $col_count; $i++)
	{
	if (defined ($row->[$i]) && $row->[$i] ne '')
	{
	push (@col, $col_names[$i]);
	push (@val, $row->[$i]);
	}
	}

	print SQL "INSERT INTO \"$tbl_name\" (",
	join (',', map { $_ } @col),
	") VALUES (",
	join (',', map { "'".$_."'" } @val),");\n";
	$cnt++;
	}
	close (SQL);
	print "written $cnt rows to $fnm_sql\n";
	$cnt;
	}

	=head2 check_1251

	find out if either column 0 or column 2 have the value 1251;
	complain otherwise

	=cut

	sub check_1251
	{
	my $rows= shift;
	foreach my $row (@$rows)
	{
	next if ($row->[0] == 1251 \|\| $row->[2] == 1251);
	print "check row: ", Dumper($row);
	}
	}

	package pga;
	# postgres aligned output

	sub mk_idx
	{
	my $tbl= shift;
	my $col= shift;

	my $idx;
	foreach my $row (@{$tbl->{rows}})
	{
	# print "row: ", main::Dumper ($row);
	my $v= $row->[$col];
	push (@{$idx->{$v}}, $row);
	}

	$tbl->{index}= $idx;
	}

	=head1 filter($table, $column_number, $value)

	return a new table consisting only of those rows from $table that contain the given $value in given $column_number.

	=cut

	sub filter
	{
	my $tbl= shift;
	my $col_num= shift;
	my $val= shift;

	my @out_rows;
	foreach my $row (@{$tbl->{rows}})
	{
	push (@out_rows, $row) if ($row->[$col_num] == $val);
	}

	my $res= { columns => $tbl->{columns}, rows => \@out_rows };
	bless ($res);

	$res;
	}

	=head2 diff ($tbl1, $tbl2)

	look for items from $tbl2 which are not present in $tbl1

	=cut

	sub diff
	{
	my $tbl1= shift;
	my $tbl2= shift;

	my $idx1= $tbl1->{index};
	my $idx2= $tbl2->{index};

	my @missing;
	ROW2: foreach my $id (sort { $a <=> $b } keys %$idx2)
	{
	if (exists ($idx1->{$id}))
	{
	# TODO: check if both rows are identical or report otherwise;
	next ROW2;
	}
	push (@missing, @{$idx2->{$id}});
	}
	(wantarray) ? @missing : \@missing;
	}

	sub read_table
	{
	my $fnm= shift;

	unless (open (FI, '<:utf8', $fnm))
	{
	print "cant read $fnm\n";
	return undef;
	}

	my @lines= <FI>;
	close (FI);

	# TODO: check for empty files or other stuff...
	chop (@lines);
	my $columns= shift (@lines);
	my $rows= pop (@lines);
	my @columns= split(/\\|/, $columns);
	# print "columns: ", main::Dumper(\@columns);
	print "rows=[$rows]\n"; # TODO: check if $rows is
	push (@lines, $rows) unless ($rows =~ m#^\(\d+ rows\)$#);

	my @rows;
	my $res=
	{
	'columns' => \@columns,
	'rows' => \@rows,
	};
	bless ($res);

	foreach my $line (@lines)
	{
	my @l= split (/\\|/, $line);
	push (@rows, \@l);
	}

	$res;
	}