Skip to content

Instantly share code, notes, and snippets.

@sashaphanes
Created October 11, 2012 17:39
Show Gist options
  • Save sashaphanes/3874181 to your computer and use it in GitHub Desktop.
Save sashaphanes/3874181 to your computer and use it in GitHub Desktop.
tableMerger
#!/usr/bin/perl
use strict;
require "getopts.pl";
use vars qw ($opt_f $opt_p $opt_q $opt_s $opt_t $opt_l $opt_m $opt_F $opt_Q $opt_L $opt_M $opt_c $opt_o $opt_h $opt_H);
&Getopts('f:p:q:l:s:t:m:F:Q:L:M:c:o:hH:');
my $executable = "merge.basedOn.column.pl";
$| = 1;
my $usage = qq{
###################################################################
usage: $executable -f file1 -q file2 -l file1_column -m file2_column [-o joining_option] > output
(used for merge two flat files like in a database)
# -f first file (tab separator for each column); use stdin if the input file not specified
# -p a list file with multiple first files (-f). Each file in one line; all must be in the same format
#
# -q second file (tab separator for each column)
#
# -l N:the joining column number of the first file
# if multiple columns use like 1,2,3
#
# -L ignore char in columns specified in -l:
# if specified, such as "chr", it will be ignored in the combined string from -l
# and not affect the joining
#
# -m N:the joining column number of the second file
# if multiple columns use like 1,2,3
#
# -M ignore char in columns specified in -M:
# if specified, such as "chr", it will be ignored in the combined string from -m
# and not affect the joining
#
# -F the row number of file header in -f [default: 1] (0: no header)
# -Q the row number of file header in -q [default: 1] (0: no header)
#
# -c case senstive for the joining data
# Y: yes [default]
# N: no
#
# -o joining option:
# L: left joining: all first file and matched second file
# default (not specified): equal joining: only matched for both
# N: left joining, but exclude those records matched in second file
#
# -s field separator:
# default: <tab>
# S: space+
# C: comma
#
# -t field separator for file2 [default as in -s]
# T: tab
# S: space+
# C: comma
#
# -hH print this
####################################################################
\n};
my ($file1, $file1ListName, %file1List, $file2, $file1Col, $file2Col, $joinOpt, $separator, $file2Separator, $caseSenstive, $file1HeaderRow,$file2HeaderRow, $ignoreL, $ignoreM, $splitPattern, $split2Pattern);
# options:
if (length($opt_h) > 0 || length($opt_H) > 0 ) {die($usage);}
if (-s $opt_f or -s $opt_p) {
$file1List{$opt_f} = 1 if ($opt_f);
if (-s $opt_p) {
$file1ListName = $opt_p;
if ($file1ListName =~ /\.gz$/i) {
open (LIST, "zcat $file1ListName |") ||die ("Cant open $file1ListName!\n");
}else {
open (LIST, "$file1ListName") ||die ("Cant open $file1ListName!\n");
}
while (<LIST>) {
chomp;
$file1 = $_;
$file1List{$file1} = 1 if (-s $file1);
}
close (LIST);
}
} else {
#die($usage);
$file1List{"stdin"} = 1;
}
if (length($opt_q) > 0) {$file2 = $opt_q;} else {die($usage);}
#if (length($opt_l) > 0) {$file1Col = $opt_l - 1;} else {die($usage);}
#if (length($opt_m) > 0) {$file2Col = $opt_m - 1;} else {die($usage);}
if (length($opt_l) > 0) {$file1Col = $opt_l ;} else {die($usage);}
if (length($opt_m) > 0) {$file2Col = $opt_m ;} else {die($usage);}
if (length($opt_F) > 0 and $opt_F >= 0 ) {$file1HeaderRow = $opt_F; } else {$file1HeaderRow = 1; }
if (length($opt_Q) > 0 and $opt_Q >= 0 ) {$file2HeaderRow = $opt_Q; } else {$file2HeaderRow = 1; }
if (length($opt_L) > 0 ) {$ignoreL = $opt_L; }
if (length($opt_M) > 0 ) {$ignoreM = $opt_M; }
if (length($opt_o) > 0) {$joinOpt = $opt_o;}
if (length($opt_s) > 0 and $opt_s =~ /^s/i) {
$separator ="space";
$splitPattern ="\\s\+";
}elsif ($opt_s =~ /^C/i) {
$separator ="comma";
$splitPattern ="\\,";
}else {
$separator ="tab";
$splitPattern ="\\t";
}
if ($opt_t =~ /^S/i) {
$file2Separator = "space";
$split2Pattern ="\\s\+";
}elsif ($opt_t =~ /^C/i ) {
$file2Separator = "comma";
$split2Pattern ="\\,";
}elsif ($opt_t =~ /^T/i ) {
$file2Separator = "tab";
$split2Pattern ="\\t";
}else {
$file2Separator = $separator;
$split2Pattern = $splitPattern;
}
if ($opt_c =~ /^N/i) {$caseSenstive = 0; } else {$caseSenstive = 1; }
my $zeroPlus = "0plus";
my $lineCount2 = 0;
my $lineCount1 = 0;
my ($line);
my (@columns1, $tempCount);
if ($file2 =~ /\.gz$/i) {
open (FILE2, "zcat $file2 |") ||die ("Cant open $file2!\n");
}else {
open (FILE2, "$file2") ||die ("Cant open $file2!\n");
}
my (%records2, @columns2, $maxCol2, $colNo2);
my ($key1, $key2, $colNm, $colAI, @file1Cols, @file2Cols,@file1ColAIs, @file2ColAIs, $file1ColsNm, $file2ColsNm);
@file1Cols = split (/\,/, $file1Col);
@file2Cols = split (/\,/, $file2Col);
$file1ColsNm = @file1Cols;
$file2ColsNm = @file2Cols;
unless ($file1ColsNm == $file2ColsNm) {
die("Error: need same total column numbers specified in -l and -m\n $usage");
}
foreach $colNm (@file1Cols ) {
die("Error: column numbers must >0 in -m\n $usage") if ($colNm <1);
$colAI = $colNm - 1;
push (@file1ColAIs, $colAI);
}
foreach $colNm (@file2Cols ) {
die("Error: column numbers must >0 in -m\n $usage") if ($colNm <1);
$colAI = $colNm - 1;
push (@file2ColAIs, $colAI);
}
my ($file1Header, $file2Header);
if ($file2HeaderRow > 0 ) {
for ($lineCount2 = 1; $lineCount2 <= $file2HeaderRow; $lineCount2++) {
$file2Header = <FILE2>;
}
chomp ($file2Header);
}
if ($ignoreM) {
while (<FILE2>) {
chomp;
$line = $_;
$lineCount2++;
#$file2Header = $_ if ($lineCount2 == $file2HeaderRow );
#if ($file2Separator eq "space") {
# @columns2 = split (/\s+/, $line);
# }elsif ($file2Separator eq "comma" ) {
# @columns2 = split(/\,/, $line);
# }else {
# @columns2 = split (/\t/, $line);
# }
@columns2 = split (/$split2Pattern/, $line);
$colNo2 = @columns2;
$maxCol2 = $colNo2 if ($colNo2 > $maxCol2);
$key2 = "";
foreach $colAI (@file2ColAIs) {
$columns2[$colAI] = $zeroPlus if ($columns2[$colAI] eq "0" ); # so that 0 key will be included
$columns2[$colAI] = "N/A" unless ($columns2[$colAI]);
$key2 .= "$columns2[$colAI]".".";
}
# $columns2[$file2Col] = $zeroPlus if ($columns2[$file2Col] eq "0" ); # so that 0 key will be included
# $columns2[$file2Col] = "N/A" unless ($columns2[$file2Col]);
$key2 =~ s/\.$//;
#$records2{$columns2[$file2Col]}{$lineCount2} = $line;
$key2 =~ s/$ignoreM//g;
$key2 = uc($key2) if ($caseSenstive == 0 );
$records2{$key2}{$lineCount2} = $line;
}
}else {
while (<FILE2>) {
chomp;
$line = $_;
$lineCount2++;
#$file2Header = $_ if ($lineCount2 == $file2HeaderRow );
# if ($file2Separator eq "space") {
# @columns2 = split (/\s+/, $line);
# }elsif ($file2Separator eq "comma" ) {
# @columns2 = split(/\,/, $line);
# }else {
# @columns2 = split (/\t/, $line);
# }
@columns2 = split (/$split2Pattern/, $line);
$colNo2 = @columns2;
$maxCol2 = $colNo2 if ($colNo2 > $maxCol2);
$key2 = "";
foreach $colAI (@file2ColAIs) {
$columns2[$colAI] = $zeroPlus if ($columns2[$colAI] eq "0" ); # so that 0 key will be included
$columns2[$colAI] = "N/A" unless ($columns2[$colAI]);
$key2 .= "$columns2[$colAI]".".";
}
# $columns2[$file2Col] = $zeroPlus if ($columns2[$file2Col] eq "0" ); # so that 0 key will be included
# $columns2[$file2Col] = "N/A" unless ($columns2[$file2Col]);
$key2 =~ s/\.$//;
#$records2{$columns2[$file2Col]}{$lineCount2} = $line;
$key2 = uc($key2) if ($caseSenstive == 0 );
$records2{$key2}{$lineCount2} = $line;
}
}
close (FILE2);
my ($file1Count, $joinLabel);
if ($joinOpt =~ /^L/i) {
$joinLabel = "--->";
}elsif ($joinOpt =~ /^N/i ) {
$joinLabel = "<---";
}else {
$joinLabel = "<-->";
}
my $FILE1;
foreach $file1 (sort keys %file1List) {
$file1Count++;
if ($file1 eq "stdin") {
$FILE1 = \*STDIN;
}elsif ($file1 =~ /\.gz$/i) {
open ($FILE1, "zcat $file1 |") || die ("Cant open $file1!\n");;
}else {
open ($FILE1, "$file1") ||die ("Cant open $file1!\n");
}
if ($file1HeaderRow > 0 ) {
for ($lineCount1 = 1; $lineCount1 <= $file1HeaderRow; $lineCount1++) {
$file1Header = <$FILE1>;
}
chomp ($file1Header);
if ($file1Count == 1 and $file1Header ) {
#if ($joinOpt =~ /^L/i) {
# print "$file1Header\t---\>\t$file2Header\n";
#}else {
# print "$line\t<--\>\t$file2Header\n";
#}
print "$file1Header\t$joinLabel\t$file2Header\n";
}
}
if ($ignoreL) {
while (<$FILE1>) {
chomp;
$line = $_;
$lineCount1++;
# $file1Header = $_ if ($lineCount1 == $file1HeaderRow );
# if ($separator eq "space") {
# @columns1 = split (/\s+/, $line);
# }elsif ($separator eq "comma" ) {
# @columns1 = split(/\,/, $line);
# }else {
# @columns1 = split (/\t/, $line);
# }
@columns1 = split (/$splitPattern/, $line);
$key1 = "";
foreach $colAI (@file1ColAIs) {
$columns1[$colAI] = $zeroPlus if ($columns1[$colAI] eq "0" ); # so that 0 key will be included
$columns1[$colAI] = "N/A" unless ($columns1[$colAI]);
$key1 .= "$columns1[$colAI]".".";
}
$key1 =~ s/\.$//;
$key1 =~ s/$ignoreL//g;
# $columns1[$file1Col] = $zeroPlus if ($columns1[$file1Col] eq "0" ); # so that 0 key will be included
# $columns1[$file1Col] = "N/A" unless ($columns1[$file1Col]);
#if ($columns1[$file1Col] ne "N/A" and defined %{$records2{$columns1[$file1Col]}} and $joinOpt !~ /^N/i) {
$key1 = uc($key1) if ($caseSenstive == 0 );
#if ($file1Header and $lineCount1 == $file1HeaderRow ) {
# if ($joinOpt =~ /^L/i) {
# print "$file1Header\t---\>\t$file2Header\n";
# }else {
# print "$line\t<--\>\t$file2Header\n";
# }
#}elsif ($key1 ne "N/A" and exists $records2{$key1} and $joinOpt !~ /^N/i) { # defined %{$records2{$key1}} will take new memory
if ($key1 ne "N/A" and exists $records2{$key1} and $joinOpt !~ /^N/i) { # defined %{$records2{$key1}} will take new memory
foreach $lineCount2 (keys %{$records2{$key1}} ) {
#if ($joinOpt =~ /^L/i) {
# print "$line\t---\>\t";
# }else {
# print "$line\t<--\>\t";
# }
print "$line\t$joinLabel\t";
print "$records2{$key1}{$lineCount2}\n";
}
}else {
if ($joinOpt =~ /^L/i) { # left joining: all first file and matched second file
print "$line\t$joinLabel";
for ($tempCount = 0; $tempCount < $maxCol2; $tempCount++) {
print "\t";
}
print "\n";
}elsif ($joinOpt =~ /^N/i ) { # left joining, but only print those not in second file
print "$line\t$joinLabel\n" unless (exists $records2{$key1} ); # defined %{$records2{$key1}} will take new memory
}
}
}
}else {
while (<$FILE1>) {
chomp;
$line = $_;
$lineCount1++;
$file1Header = $_ if ($lineCount1 == $file1HeaderRow );
#if ($separator eq "space") {
# @columns1 = split (/\s+/, $line);
# }elsif ($separator eq "comma" ) {
# @columns1 = split(/\,/, $line);
# }else {
# @columns1 = split (/\t/, $line);
# }
@columns1 = split (/$splitPattern/, $line);
$key1 = "";
foreach $colAI (@file1ColAIs) {
$columns1[$colAI] = $zeroPlus if ($columns1[$colAI] eq "0" ); # so that 0 key will be included
$columns1[$colAI] = "N/A" unless ($columns1[$colAI]);
$key1 .= "$columns1[$colAI]".".";
}
$key1 =~ s/\.$//;
# $columns1[$file1Col] = $zeroPlus if ($columns1[$file1Col] eq "0" ); # so that 0 key will be included
# $columns1[$file1Col] = "N/A" unless ($columns1[$file1Col]);
#if ($columns1[$file1Col] ne "N/A" and defined %{$records2{$columns1[$file1Col]}} and $joinOpt !~ /^N/i) {
$key1 = uc($key1) if ($caseSenstive == 0 );
#if ($file1Header and $lineCount1 == $file1HeaderRow ) {
# if ($joinOpt =~ /^L/i) {
# print "$file1Header\t---\>\t$file2Header\n";
# }else {
# print "$line\t<--\>\t$file2Header\n";
# }
# }elsif ($key1 ne "N/A" and exists $records2{$key1} and $joinOpt !~ /^N/i) { # defined %{$records2{$key1}} will take new memory
if ($key1 ne "N/A" and exists $records2{$key1} and $joinOpt !~ /^N/i) { # defined %{$records2{$key1}} will take new memory
foreach $lineCount2 (keys %{$records2{$key1}} ) {
#if ($joinOpt =~ /^L/i) {
# print "$line\t---\>\t";
#}else {
# print "$line\t<--\>\t";
# }
print "$line\t$joinLabel\t";
print "$records2{$key1}{$lineCount2}\n";
}
}else {
if ($joinOpt =~ /^L/i) { # left joining: all first file and matched second file
print "$line\t$joinLabel";
for ($tempCount = 0; $tempCount < $maxCol2; $tempCount++) {
print "\t";
}
print "\n";
}elsif ($joinOpt =~ /^N/i ) { # left joining, but only print those not in second file
print "$line\t$joinLabel\n" unless (exists $records2{$key1} ); # defined %{$records2{$key1}} will take new memory
}
}
}
}
close ($FILE1);
}
exit;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment