Skip to content

Instantly share code, notes, and snippets.

@mmterpstra
Last active October 28, 2016 12:28
Show Gist options
  • Select an option

  • Save mmterpstra/b1939727e57944f3270c to your computer and use it in GitHub Desktop.

Select an option

Save mmterpstra/b1939727e57944f3270c to your computer and use it in GitHub Desktop.
molgenis-compute tools
use warnings;
use strict;
use Data::Dumper;
main();
sub main {
InfoMsg("Use 'perl $0 /Path/To/Compute/Jobs/*.sh'\n");
InfoMsg("Commandline : $0 ".join(" ",@ARGV)."\n");
DuplicateParameterRemoval(@ARGV);
}
sub DuplicateParameterRemoval{
for my $shFile (@_){
InfoMsg("File $shFile does not exist!!!")if(! -e $shFile);
next if(! -e $shFile);
my $sh = ShLoader($shFile);
$sh = ShDupRemove($sh);
#print Dumper(\$sh);
ShWriter($sh);
}
}
sub ShLoader {
my $shFile = shift(@_);
my $self -> {'file'} = $shFile;
InfoMsg ("Loading sh file ". $self -> {'file'});
open(my $shHandle, "<", $shFile) or die "cannot open $shFile";
while(<$shHandle>){
$self -> {'line'} -> {$.} = $_;
push(@{$self -> {'order'}} , $.);
}
close $shHandle;
return $self;
}
sub ShWriter {
my $self = shift @_;
#open(my $shHandle, ">", $shFile) or die "cannot open $shFile";
#DebugMsg("Making backups and writing to " . $self -> {'file'});
my $mvCmd = "mv -v " . $self -> {'file'}." " . $self -> {'file'}.".bak_".time.".sh";
InfoMsg($mvCmd);
InfoMsg(`$mvCmd`);
open(my $shHandle, ">", $self -> {'file'}) or die "cannot open $self -> {'file'}";
#InfoMsg(Dumper($self -> {'order'}));
for my $lIndex (@{$self -> {'order'}}){
print $shHandle $self -> {'line'} -> {$lIndex};
}
close $shHandle;
}
sub ShDupRemove {
my $self = shift @_;
InfoMsg("Removing static array dups ". $self -> {'file'});
my $ignoreLineLen=300;
my $fileLen = scalar(keys(%{$self -> {'line'}}));
#DebugMsg("$fileLen");
#copy metadata
my $selfNew;
$selfNew -> {'file'} = $self -> {'file'};
$selfNew -> {'order'} = $self -> {'order'};
#process lines removing weaved array dups in long commands
for my $lIndex (keys(%{$self -> {'line'}})){
if(length($self -> {'line'} -> {$lIndex}) < $ignoreLineLen){
$selfNew -> {'line'} -> {$lIndex} = $self -> {'line'} -> {$lIndex};
}else{
$selfNew -> {'line'} -> {$lIndex} = ScanArrayDupsAndRemove($self -> {'line'} -> {$lIndex});
}
}
#look for static array declarations a la molgenis
$selfNew = ShStaticArrayDeclareRemover($selfNew);
return $selfNew;
}
sub ShStaticArrayDeclareRemover {
my $self = shift @_;
InfoMsg("Removing array declare dups ". $self -> {'file'});
#copy metadata
my $selfNew;
$selfNew -> {'file'} = $self -> {'file'};
$selfNew -> {'order'} = $self -> {'order'};
for my $lIndex (@{$self -> {'order'}}){
$selfNew -> {'line'} -> {$lIndex} = $self -> {'line'} -> {$lIndex};
warn $lIndex if not ($selfNew -> {'line'} -> {$lIndex});
if($selfNew -> {'line'} -> {$lIndex} =~ m!^(\w*)\[\d+\]\=(\"[\/\w\._\-\*]*\")$!){
#this code should collect / merge data from array defention blocks and clean then up
#if same variable as last > continue collecting
#if other variable -> cleanup prevous variable by reinjecting previous lines and initiate collect of other variable
#$selfNew -> {'line'} -> {$lIndex} =~ m!^(\w*)\[\d+\]\=(\"[\/\w\._\-\*]*\")$!;
#InfoMsg("call cleanup ") if ($selfNew -> {'declare_last'} ne $1);
if(not(defined($selfNew -> {'declare_last'}))){
$selfNew -> {'declare_last'} = $1;
$selfNew -> {'declare'} -> {$1} -> {'variablekey'} = $1;
$selfNew -> {'declare'} -> {$1} -> {'line_first'} = $lIndex;
$selfNew -> {'declare'} -> {$1} -> {'line_last'} = $lIndex;
$selfNew -> {'declare'} -> {$1} -> {'variableval'} -> {$lIndex} = $2;
$selfNew -> {'declare'} -> {$1} -> {'variableuniq'} -> {$2} ++;
}elsif(defined($selfNew -> {'declare_last'}) && $selfNew -> {'declare_last'} eq $1 && $selfNew -> {'declare'} -> {$1} -> {'line_last'} + 1 == $lIndex){
$selfNew -> {'declare_last'} = $1;
$selfNew -> {'declare'} -> {$1} -> {'variablekey'} = $1;
$selfNew -> {'declare'} -> {$1} -> {'line_last'} = $lIndex;
#$selfNew -> {'declare'} -> {$1} -> {'line_first'} = $lIndex if (not(defined($selfNew -> {'declare'} -> {'line_first'})));
$selfNew -> {'declare'} -> {$1} -> {'variableval'} -> {$lIndex} = $2;
$selfNew -> {'declare'} -> {$1} -> {'variableuniq'} -> {$2} ++;
}elsif(defined($selfNew -> {'declare_last'}) && $selfNew -> {'declare_last'} ne $1 || $selfNew -> {'declare'} -> {$1} -> {'line_last'} + 1 != $lIndex){
#cleanup;
InfoMsg("Removing array declare dups: call CleanUpDuplicatesArrayDefinition ". $self -> {'file'});
$selfNew=CleanUpDuplicatesArrayDefinition($selfNew, $selfNew -> {'declare_last'});
#reassign vars
$selfNew -> {'line'} -> {$lIndex} =~ m!^(\w*)\[\d+\]\=(\"[\/\w\._\-\*]*\")$!;
$selfNew -> {'declare_last'} = $1;
$selfNew -> {'declare'} -> {$1} -> {'variablekey'} = $1;
$selfNew -> {'declare'} -> {$1} -> {'line_first'} = $lIndex;
$selfNew -> {'declare'} -> {$1} -> {'variableval'} -> {$lIndex} = $2;
$selfNew -> {'declare'} -> {$1} -> {'line_last'} = $lIndex;
$selfNew -> {'declare'} -> {$1} -> {'variableuniq'} -> {$2} ++;
}else{
die "uncaught error $lIndex";
}
#$selfNew -> {'declare_last'} = {$1};
#$selfNew -> {'declare'} -> {$1} -> {'variablekey'} = $1;
#$selfNew -> {'declare'} -> {$1} -> {'line_last'} = $lIndex;
#$selfNew -> {'declare'} -> {$1} -> {'line_first'} = $lIndex if (not(defined($selfNew -> {'declare'} -> {'line_first'})));
#$selfNew -> {'declare'} -> {$1} -> {'variableval'} -> {$lIndex} = $2;
}
}
if(defined($selfNew -> {'declare_last'})){
$selfNew = CleanUpDuplicatesArrayDefinition($selfNew, $selfNew -> {'declare_last'})
}
return $selfNew;
}
sub CleanUpDuplicatesArrayDefinition {
my $self = shift @_;
InfoMsg("Removing duplicates of ".$_[0]);
#for safety count the amount of lines between line first / last
my $key =shift @_;#arrayname to cleanup
my $count = 0;
for my $lindex (@{$self -> {'order'}}){
if ($self -> {'declare'} -> {$key} -> {'line_last'} >= $lindex &&
$self -> {'declare'} -> {$key} -> {'line_first'} <= $lindex){
$count++;
}
}
DebugMsg( "something strange has happend more variablevals then lines to declare them on!" , \$count,scalar(keys(%{$self -> {'declare'} -> {$key} -> {'variableval'}})) )if(scalar(keys(%{$self -> {'declare'} -> {$key} -> {'variableval'}})) > $count);
#make array definitions
my @array = ();
my $arrayIndex=0;
for my $arrayval (keys %{$self -> {'declare'} -> {$key} -> {'variableuniq'}}){
push(@array, ${key}."\[".$arrayIndex."\]=".$arrayval."\n");
$arrayIndex++;
}
#cleanup by redefing some lines and empty stringing the rest
for my $lIndex (@{$self -> {'order'}}){
if($self -> {'declare'} -> {$key} -> {'line_last'} >= $lIndex && $self -> {'declare'} -> {$key} -> {'line_first'} <= $lIndex && scalar(@array) > 0){
$self -> {'line'} -> {$lIndex} = shift @array;
}elsif($self -> {'declare'} -> {$key} -> {'line_last'} >= $lIndex && $self -> {'declare'} -> {$key} -> {'line_first'} <= $lIndex && scalar(@array) == 0){
$self -> {'line'} -> {$lIndex} = "";
}
}
return $self;
}
sub ScanArrayDupsAndRemove {
my $line = shift @_;
my $lineLen= length($line);
if($line =~ m!\"[\/\w\._\-\*]*\"( \"[\/\w\._\-\*]*\")+!){
my $arraymatchRes;
$arraymatchRes -> {'match'} = $&;
my $index = index($line, $arraymatchRes -> {'match'});
$arraymatchRes -> {'pre'} = substr($line,0,$index);
$arraymatchRes -> {'post'} = substr($line,$index+length($arraymatchRes -> {'match'}));
#InfoMsg(Dumper($arraymatchRes));
my @vals = split(' ',$arraymatchRes -> {'match'});
my %uniq;
map{$uniq{$_}++;}(@vals);
$arraymatchRes -> {'match'} =join(" ", keys (%uniq));
#InfoMsg(Dumper($arraymatchRes));
$arraymatchRes -> {'post'} = ScanArrayDupsAndRemove( $arraymatchRes -> {'post'});
$line = $arraymatchRes -> {'pre'} . $arraymatchRes -> {'match'} . $arraymatchRes -> {'post'};
}
#for (my $i = 0; $i < $lineLen; $i++){
# my $char = substr($line,$i,1);
#
#}
return $line;
}
sub InfoMsg {
warn "### ".localtime(time())." ### " . join("\n ### ",@_) . "\n";
}
sub DebugMsg {
warn "### ".localtime(time())." ### " . join("\n ### ",@_) . "\n";
die Dumper (\@_);
}
use warnings;
use strict;
use Data::Dumper;
my $use =<<"END";
use: perl $0 samplesheet.csv
Validator tool.
Exits with error if there are errors in the samplesheet.
END
main();
sub main {
my $mergedSamplesheet;
while(scalar(@ARGV)){
my $samplesheetfile = shift @ARGV;
my $samplesheet = ReadSamplesheet($samplesheetfile);
push(@$mergedSamplesheet,@$samplesheet);
}
ValidateControlSampleNames($mergedSamplesheet);
print SamplesheetAsString($mergedSamplesheet);
}
sub ReadSamplesheet {
my $samplesheetf = shift @_;
my $samplesheet;
open( my $samplesheeth,'<', $samplesheetf) or die "Cannot read samplesheet file:'$samplesheetf'";
$_=<$samplesheeth>;
chomp;
my @h = CommaseparatedSplit($_);
#die Dumper(\@h);
while(<$samplesheeth>){
chomp;
my @c = CommaseparatedSplit($_);
ValidateColvalues(\@h,\@c);
#die Dumper(\@c);
my %d;
my $i=0;
map{$d{$_}=$c[$i]; $i++;}(@h);
$i++;
#$c[$i]=join(",",@h);
#ReadFileNameConstructor(\%d);
push(@$samplesheet,\%d);
}
return $samplesheet;
}
sub ValidateColvalues {
my $header = shift @_;
my $columns = shift @_;
if(scalar(@{$header}) ne scalar(@{$columns})){
die "[VALIDATIONERROR] no of columns (".scalar(@{$columns}).") in line $. are not equal to columns in header ".scalar(@{$columns})
.".\nArray dump of header".Dumper($header)
.".\nArray dump of $. columns".Dumper($columns)." ";
}
}
sub ValidateControlSampleNames {
my $samplesheet = shift @_;
#my $samplesh
return if (not(defined($samplesheet -> [0] -> {'controlSampleName'})));
my @s;#sample
map{push(@s, $_ -> {'sampleName'})}(@{$samplesheet});
my $i=1;
my $cs; #controlsample
map{push(@{$cs}, {'controlSampleName' => $_ -> {'controlSampleName'}, 'line' => $i}); $i++;}(@{$samplesheet});
my $error = "";
map{
my $csname = $_ -> {'controlSampleName'};
my $csline = $_ -> {'line'};
my $seen = 0;
map{
$seen++ if($_ eq $csname);
}(@s);
$error = "'".$csname."' at $csline," if(not($seen));
}(@{$cs});
chop $error;
die "[VALIDATIONERROR] the following samplename(s) are seen in the controlsamplenames row but not seen in the samplenames row (format'controlsaplename' at \$lineno) : \n".$error if($error ne "");
}
sub CommaseparatedSplit {
my $string=pop @_;
#needs to be fixed for citation marks!
warn "Line contains citation marks: this is currently not supported!!. I hope this works. Line=$_" if($string =~ /"|'/);
my $i = index($string,",");
if( $i > -1){
push(@_,substr($string,0,$i));
push(@_,substr($string,$i+1));
@_ = CommaseparatedSplit( @_ );
}else{
push(@_,$string);
return @_;
}
}
sub SamplesheetAsString {
my $self = shift @_;
my $string = '';
#get header values;
my %h;
for my $sample (@$self){
for my $key (keys(%$sample)){
$h{$key}++;
}
}
my @h = sort {$b cmp $a} (keys(%h));
$string.=join(",",@h)."\n";
#warn scalar(@$self);
for my $sample (@$self){
my @c;
for my $h (@h){
push (@c,$$sample{$h});
}
$string.=join(",",@c)."\n";
}
return $string;
}
@mmterpstra
Copy link
Author

Todo for keeping consistent sorting:
216 till 219 for weaved variables (more complex object needed for tracking the indexes then a join on sorted keys by index)
187 till 190 for array declarations (first insert tracker on 143 something like $selfNew -> {'declare'} -> {$1} -> {'variableuniqlfirst'} -> {$2} =$lIndex;)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment