Created
October 11, 2012 17:39
-
-
Save sashaphanes/3874181 to your computer and use it in GitHub Desktop.
tableMerger
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
require "getopts.pl"; | |
use vars qw ($opt_f $opt_p $opt_q $opt_s $opt_t $opt_l $opt_m $opt_F $opt_Q $opt_L $opt_M $opt_c $opt_o $opt_h $opt_H); | |
&Getopts('f:p:q:l:s:t:m:F:Q:L:M:c:o:hH:'); | |
my $executable = "merge.basedOn.column.pl"; | |
$| = 1; | |
my $usage = qq{ | |
################################################################### | |
usage: $executable -f file1 -q file2 -l file1_column -m file2_column [-o joining_option] > output | |
(used for merge two flat files like in a database) | |
# -f first file (tab separator for each column); use stdin if the input file not specified | |
# -p a list file with multiple first files (-f). Each file in one line; all must be in the same format | |
# | |
# -q second file (tab separator for each column) | |
# | |
# -l N:the joining column number of the first file | |
# if multiple columns use like 1,2,3 | |
# | |
# -L ignore char in columns specified in -l: | |
# if specified, such as "chr", it will be ignored in the combined string from -l | |
# and not affect the joining | |
# | |
# -m N:the joining column number of the second file | |
# if multiple columns use like 1,2,3 | |
# | |
# -M ignore char in columns specified in -M: | |
# if specified, such as "chr", it will be ignored in the combined string from -m | |
# and not affect the joining | |
# | |
# -F the row number of file header in -f [default: 1] (0: no header) | |
# -Q the row number of file header in -q [default: 1] (0: no header) | |
# | |
# -c case senstive for the joining data | |
# Y: yes [default] | |
# N: no | |
# | |
# -o joining option: | |
# L: left joining: all first file and matched second file | |
# default (not specified): equal joining: only matched for both | |
# N: left joining, but exclude those records matched in second file | |
# | |
# -s field separator: | |
# default: <tab> | |
# S: space+ | |
# C: comma | |
# | |
# -t field separator for file2 [default as in -s] | |
# T: tab | |
# S: space+ | |
# C: comma | |
# | |
# -hH print this | |
#################################################################### | |
\n}; | |
my ($file1, $file1ListName, %file1List, $file2, $file1Col, $file2Col, $joinOpt, $separator, $file2Separator, $caseSenstive, $file1HeaderRow,$file2HeaderRow, $ignoreL, $ignoreM, $splitPattern, $split2Pattern); | |
# options: | |
if (length($opt_h) > 0 || length($opt_H) > 0 ) {die($usage);} | |
if (-s $opt_f or -s $opt_p) { | |
$file1List{$opt_f} = 1 if ($opt_f); | |
if (-s $opt_p) { | |
$file1ListName = $opt_p; | |
if ($file1ListName =~ /\.gz$/i) { | |
open (LIST, "zcat $file1ListName |") ||die ("Cant open $file1ListName!\n"); | |
}else { | |
open (LIST, "$file1ListName") ||die ("Cant open $file1ListName!\n"); | |
} | |
while (<LIST>) { | |
chomp; | |
$file1 = $_; | |
$file1List{$file1} = 1 if (-s $file1); | |
} | |
close (LIST); | |
} | |
} else { | |
#die($usage); | |
$file1List{"stdin"} = 1; | |
} | |
if (length($opt_q) > 0) {$file2 = $opt_q;} else {die($usage);} | |
#if (length($opt_l) > 0) {$file1Col = $opt_l - 1;} else {die($usage);} | |
#if (length($opt_m) > 0) {$file2Col = $opt_m - 1;} else {die($usage);} | |
if (length($opt_l) > 0) {$file1Col = $opt_l ;} else {die($usage);} | |
if (length($opt_m) > 0) {$file2Col = $opt_m ;} else {die($usage);} | |
if (length($opt_F) > 0 and $opt_F >= 0 ) {$file1HeaderRow = $opt_F; } else {$file1HeaderRow = 1; } | |
if (length($opt_Q) > 0 and $opt_Q >= 0 ) {$file2HeaderRow = $opt_Q; } else {$file2HeaderRow = 1; } | |
if (length($opt_L) > 0 ) {$ignoreL = $opt_L; } | |
if (length($opt_M) > 0 ) {$ignoreM = $opt_M; } | |
if (length($opt_o) > 0) {$joinOpt = $opt_o;} | |
if (length($opt_s) > 0 and $opt_s =~ /^s/i) { | |
$separator ="space"; | |
$splitPattern ="\\s\+"; | |
}elsif ($opt_s =~ /^C/i) { | |
$separator ="comma"; | |
$splitPattern ="\\,"; | |
}else { | |
$separator ="tab"; | |
$splitPattern ="\\t"; | |
} | |
if ($opt_t =~ /^S/i) { | |
$file2Separator = "space"; | |
$split2Pattern ="\\s\+"; | |
}elsif ($opt_t =~ /^C/i ) { | |
$file2Separator = "comma"; | |
$split2Pattern ="\\,"; | |
}elsif ($opt_t =~ /^T/i ) { | |
$file2Separator = "tab"; | |
$split2Pattern ="\\t"; | |
}else { | |
$file2Separator = $separator; | |
$split2Pattern = $splitPattern; | |
} | |
if ($opt_c =~ /^N/i) {$caseSenstive = 0; } else {$caseSenstive = 1; } | |
my $zeroPlus = "0plus"; | |
my $lineCount2 = 0; | |
my $lineCount1 = 0; | |
my ($line); | |
my (@columns1, $tempCount); | |
if ($file2 =~ /\.gz$/i) { | |
open (FILE2, "zcat $file2 |") ||die ("Cant open $file2!\n"); | |
}else { | |
open (FILE2, "$file2") ||die ("Cant open $file2!\n"); | |
} | |
my (%records2, @columns2, $maxCol2, $colNo2); | |
my ($key1, $key2, $colNm, $colAI, @file1Cols, @file2Cols,@file1ColAIs, @file2ColAIs, $file1ColsNm, $file2ColsNm); | |
@file1Cols = split (/\,/, $file1Col); | |
@file2Cols = split (/\,/, $file2Col); | |
$file1ColsNm = @file1Cols; | |
$file2ColsNm = @file2Cols; | |
unless ($file1ColsNm == $file2ColsNm) { | |
die("Error: need same total column numbers specified in -l and -m\n $usage"); | |
} | |
foreach $colNm (@file1Cols ) { | |
die("Error: column numbers must >0 in -m\n $usage") if ($colNm <1); | |
$colAI = $colNm - 1; | |
push (@file1ColAIs, $colAI); | |
} | |
foreach $colNm (@file2Cols ) { | |
die("Error: column numbers must >0 in -m\n $usage") if ($colNm <1); | |
$colAI = $colNm - 1; | |
push (@file2ColAIs, $colAI); | |
} | |
my ($file1Header, $file2Header); | |
if ($file2HeaderRow > 0 ) { | |
for ($lineCount2 = 1; $lineCount2 <= $file2HeaderRow; $lineCount2++) { | |
$file2Header = <FILE2>; | |
} | |
chomp ($file2Header); | |
} | |
if ($ignoreM) { | |
while (<FILE2>) { | |
chomp; | |
$line = $_; | |
$lineCount2++; | |
#$file2Header = $_ if ($lineCount2 == $file2HeaderRow ); | |
#if ($file2Separator eq "space") { | |
# @columns2 = split (/\s+/, $line); | |
# }elsif ($file2Separator eq "comma" ) { | |
# @columns2 = split(/\,/, $line); | |
# }else { | |
# @columns2 = split (/\t/, $line); | |
# } | |
@columns2 = split (/$split2Pattern/, $line); | |
$colNo2 = @columns2; | |
$maxCol2 = $colNo2 if ($colNo2 > $maxCol2); | |
$key2 = ""; | |
foreach $colAI (@file2ColAIs) { | |
$columns2[$colAI] = $zeroPlus if ($columns2[$colAI] eq "0" ); # so that 0 key will be included | |
$columns2[$colAI] = "N/A" unless ($columns2[$colAI]); | |
$key2 .= "$columns2[$colAI]"."."; | |
} | |
# $columns2[$file2Col] = $zeroPlus if ($columns2[$file2Col] eq "0" ); # so that 0 key will be included | |
# $columns2[$file2Col] = "N/A" unless ($columns2[$file2Col]); | |
$key2 =~ s/\.$//; | |
#$records2{$columns2[$file2Col]}{$lineCount2} = $line; | |
$key2 =~ s/$ignoreM//g; | |
$key2 = uc($key2) if ($caseSenstive == 0 ); | |
$records2{$key2}{$lineCount2} = $line; | |
} | |
}else { | |
while (<FILE2>) { | |
chomp; | |
$line = $_; | |
$lineCount2++; | |
#$file2Header = $_ if ($lineCount2 == $file2HeaderRow ); | |
# if ($file2Separator eq "space") { | |
# @columns2 = split (/\s+/, $line); | |
# }elsif ($file2Separator eq "comma" ) { | |
# @columns2 = split(/\,/, $line); | |
# }else { | |
# @columns2 = split (/\t/, $line); | |
# } | |
@columns2 = split (/$split2Pattern/, $line); | |
$colNo2 = @columns2; | |
$maxCol2 = $colNo2 if ($colNo2 > $maxCol2); | |
$key2 = ""; | |
foreach $colAI (@file2ColAIs) { | |
$columns2[$colAI] = $zeroPlus if ($columns2[$colAI] eq "0" ); # so that 0 key will be included | |
$columns2[$colAI] = "N/A" unless ($columns2[$colAI]); | |
$key2 .= "$columns2[$colAI]"."."; | |
} | |
# $columns2[$file2Col] = $zeroPlus if ($columns2[$file2Col] eq "0" ); # so that 0 key will be included | |
# $columns2[$file2Col] = "N/A" unless ($columns2[$file2Col]); | |
$key2 =~ s/\.$//; | |
#$records2{$columns2[$file2Col]}{$lineCount2} = $line; | |
$key2 = uc($key2) if ($caseSenstive == 0 ); | |
$records2{$key2}{$lineCount2} = $line; | |
} | |
} | |
close (FILE2); | |
my ($file1Count, $joinLabel); | |
if ($joinOpt =~ /^L/i) { | |
$joinLabel = "--->"; | |
}elsif ($joinOpt =~ /^N/i ) { | |
$joinLabel = "<---"; | |
}else { | |
$joinLabel = "<-->"; | |
} | |
my $FILE1; | |
foreach $file1 (sort keys %file1List) { | |
$file1Count++; | |
if ($file1 eq "stdin") { | |
$FILE1 = \*STDIN; | |
}elsif ($file1 =~ /\.gz$/i) { | |
open ($FILE1, "zcat $file1 |") || die ("Cant open $file1!\n");; | |
}else { | |
open ($FILE1, "$file1") ||die ("Cant open $file1!\n"); | |
} | |
if ($file1HeaderRow > 0 ) { | |
for ($lineCount1 = 1; $lineCount1 <= $file1HeaderRow; $lineCount1++) { | |
$file1Header = <$FILE1>; | |
} | |
chomp ($file1Header); | |
if ($file1Count == 1 and $file1Header ) { | |
#if ($joinOpt =~ /^L/i) { | |
# print "$file1Header\t---\>\t$file2Header\n"; | |
#}else { | |
# print "$line\t<--\>\t$file2Header\n"; | |
#} | |
print "$file1Header\t$joinLabel\t$file2Header\n"; | |
} | |
} | |
if ($ignoreL) { | |
while (<$FILE1>) { | |
chomp; | |
$line = $_; | |
$lineCount1++; | |
# $file1Header = $_ if ($lineCount1 == $file1HeaderRow ); | |
# if ($separator eq "space") { | |
# @columns1 = split (/\s+/, $line); | |
# }elsif ($separator eq "comma" ) { | |
# @columns1 = split(/\,/, $line); | |
# }else { | |
# @columns1 = split (/\t/, $line); | |
# } | |
@columns1 = split (/$splitPattern/, $line); | |
$key1 = ""; | |
foreach $colAI (@file1ColAIs) { | |
$columns1[$colAI] = $zeroPlus if ($columns1[$colAI] eq "0" ); # so that 0 key will be included | |
$columns1[$colAI] = "N/A" unless ($columns1[$colAI]); | |
$key1 .= "$columns1[$colAI]"."."; | |
} | |
$key1 =~ s/\.$//; | |
$key1 =~ s/$ignoreL//g; | |
# $columns1[$file1Col] = $zeroPlus if ($columns1[$file1Col] eq "0" ); # so that 0 key will be included | |
# $columns1[$file1Col] = "N/A" unless ($columns1[$file1Col]); | |
#if ($columns1[$file1Col] ne "N/A" and defined %{$records2{$columns1[$file1Col]}} and $joinOpt !~ /^N/i) { | |
$key1 = uc($key1) if ($caseSenstive == 0 ); | |
#if ($file1Header and $lineCount1 == $file1HeaderRow ) { | |
# if ($joinOpt =~ /^L/i) { | |
# print "$file1Header\t---\>\t$file2Header\n"; | |
# }else { | |
# print "$line\t<--\>\t$file2Header\n"; | |
# } | |
#}elsif ($key1 ne "N/A" and exists $records2{$key1} and $joinOpt !~ /^N/i) { # defined %{$records2{$key1}} will take new memory | |
if ($key1 ne "N/A" and exists $records2{$key1} and $joinOpt !~ /^N/i) { # defined %{$records2{$key1}} will take new memory | |
foreach $lineCount2 (keys %{$records2{$key1}} ) { | |
#if ($joinOpt =~ /^L/i) { | |
# print "$line\t---\>\t"; | |
# }else { | |
# print "$line\t<--\>\t"; | |
# } | |
print "$line\t$joinLabel\t"; | |
print "$records2{$key1}{$lineCount2}\n"; | |
} | |
}else { | |
if ($joinOpt =~ /^L/i) { # left joining: all first file and matched second file | |
print "$line\t$joinLabel"; | |
for ($tempCount = 0; $tempCount < $maxCol2; $tempCount++) { | |
print "\t"; | |
} | |
print "\n"; | |
}elsif ($joinOpt =~ /^N/i ) { # left joining, but only print those not in second file | |
print "$line\t$joinLabel\n" unless (exists $records2{$key1} ); # defined %{$records2{$key1}} will take new memory | |
} | |
} | |
} | |
}else { | |
while (<$FILE1>) { | |
chomp; | |
$line = $_; | |
$lineCount1++; | |
$file1Header = $_ if ($lineCount1 == $file1HeaderRow ); | |
#if ($separator eq "space") { | |
# @columns1 = split (/\s+/, $line); | |
# }elsif ($separator eq "comma" ) { | |
# @columns1 = split(/\,/, $line); | |
# }else { | |
# @columns1 = split (/\t/, $line); | |
# } | |
@columns1 = split (/$splitPattern/, $line); | |
$key1 = ""; | |
foreach $colAI (@file1ColAIs) { | |
$columns1[$colAI] = $zeroPlus if ($columns1[$colAI] eq "0" ); # so that 0 key will be included | |
$columns1[$colAI] = "N/A" unless ($columns1[$colAI]); | |
$key1 .= "$columns1[$colAI]"."."; | |
} | |
$key1 =~ s/\.$//; | |
# $columns1[$file1Col] = $zeroPlus if ($columns1[$file1Col] eq "0" ); # so that 0 key will be included | |
# $columns1[$file1Col] = "N/A" unless ($columns1[$file1Col]); | |
#if ($columns1[$file1Col] ne "N/A" and defined %{$records2{$columns1[$file1Col]}} and $joinOpt !~ /^N/i) { | |
$key1 = uc($key1) if ($caseSenstive == 0 ); | |
#if ($file1Header and $lineCount1 == $file1HeaderRow ) { | |
# if ($joinOpt =~ /^L/i) { | |
# print "$file1Header\t---\>\t$file2Header\n"; | |
# }else { | |
# print "$line\t<--\>\t$file2Header\n"; | |
# } | |
# }elsif ($key1 ne "N/A" and exists $records2{$key1} and $joinOpt !~ /^N/i) { # defined %{$records2{$key1}} will take new memory | |
if ($key1 ne "N/A" and exists $records2{$key1} and $joinOpt !~ /^N/i) { # defined %{$records2{$key1}} will take new memory | |
foreach $lineCount2 (keys %{$records2{$key1}} ) { | |
#if ($joinOpt =~ /^L/i) { | |
# print "$line\t---\>\t"; | |
#}else { | |
# print "$line\t<--\>\t"; | |
# } | |
print "$line\t$joinLabel\t"; | |
print "$records2{$key1}{$lineCount2}\n"; | |
} | |
}else { | |
if ($joinOpt =~ /^L/i) { # left joining: all first file and matched second file | |
print "$line\t$joinLabel"; | |
for ($tempCount = 0; $tempCount < $maxCol2; $tempCount++) { | |
print "\t"; | |
} | |
print "\n"; | |
}elsif ($joinOpt =~ /^N/i ) { # left joining, but only print those not in second file | |
print "$line\t$joinLabel\n" unless (exists $records2{$key1} ); # defined %{$records2{$key1}} will take new memory | |
} | |
} | |
} | |
} | |
close ($FILE1); | |
} | |
exit; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment