Created
November 19, 2013 21:13
-
-
Save standage/7552696 to your computer and use it in GitHub Desktop.
This script is motivated by a task that commonly comes up in my day-to-day research--a task that more often than not I code from scratch each time. The details vary, but the basics are the same. I have a table or mapping of important data, and I want to use a list of identifiers to pull out a subset of entries from that table or mapping. I've wr…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env perl | |
| use strict; | |
| use warnings; | |
| use Getopt::Long; | |
| # Copyright (c) 2013, Daniel S. Standage <daniel.standage@gmail.com> | |
| # | |
| # Permission to use, copy, modify, and/or distribute this software for any | |
| # purpose with or without fee is hereby granted, provided that the above | |
| # copyright notice and this permission notice appear in all copies. | |
| # | |
| # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |
| # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
| # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |
| # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
| # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
| # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
| # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
| # ------------------------------------------------------------------------------ | |
| # This script is motivated by a task that commonly comes up in my day-to-day | |
| # research--a task that more often than not I code from scratch each time. The | |
| # details vary, but the basics are the same. I have a table or mapping of | |
| # important data, and I want to use a list of identifiers to pull out a subset | |
| # of entries from that table or mapping. I've written this script with the hope | |
| # that I can solve the general problem once and for all. | |
| sub print_usage | |
| { | |
| my $outstream = shift(@_); | |
| print $outstream " | |
| selex: pull out needles in your tabular haystack! | |
| Input: a 'needles' file and a 'haystack' file. The 'needles' file should | |
| contain a list of values (one per line) corresponding to entries to | |
| be extracted from the 'haystack' file. The 'haystack' file should be a | |
| tab or comma delimited file containing the data of interest. | |
| Output: entries (or specific values) from the 'haystack' file | |
| Usage: selex [options] needles haystack | |
| Options: | |
| -d|--delim: CHAR delimiter for 'haystacks' file; default is tab | |
| -h|--help print this help message and exit | |
| -k|--hkey: INT field from the 'haystack' file that should be used to | |
| match the key from the 'needles' file; default is 0 | |
| -m|--ndelim: CHAR delimiter for 'needles' file (if there are multiple | |
| values per line); default is tab | |
| -n|--nkey: INT if there are multiple values per line in the 'needles' | |
| file, this file refers to the field that should be used | |
| to match the key from the 'haystack' file; default is 0 | |
| -o|--out: INT comma-separated list of fields to print from each | |
| matched haystack entry; -1 prints the entire line; | |
| default is -1 | |
| "; | |
| } | |
| # Parse command line options | |
| my $hdelim = "\t"; | |
| my $hkey = 0; | |
| my $ndelim = "\t"; | |
| my $nkey = 0; | |
| my $outstr = "-1"; | |
| GetOptions | |
| ( | |
| "d|delim=s" => \$hdelim, | |
| "h|help" => sub { print_usage(\*STDOUT); exit(0); }, | |
| "k|hkey=i" => \$hkey, | |
| "m|ndelim=s" => \$ndelim, | |
| "n|nkey=i" => \$nkey, | |
| "o|out=s" => \$outstr, | |
| ); | |
| my $needlesfile = shift(@ARGV) or do | |
| { | |
| printf(STDERR "error: please provide 'needles' file"); | |
| print_usage(\*STDERR); | |
| exit(1); | |
| }; | |
| my $haystackfile = shift(@ARGV) or do | |
| { | |
| printf(STDERR "error: please provide 'haystack' file"); | |
| print_usage(\*STDERR); | |
| exit(1); | |
| }; | |
| my @outfields = split(/,/, $outstr); | |
| # Load needles into an associative array (for quick lookup) | |
| my %needles; | |
| open(my $NDL, "<", $needlesfile) or die("error opening file '$needlesfile'"); | |
| while(my $line = <$NDL>) | |
| { | |
| chomp($line); | |
| my @fields = split(/$ndelim/, $line); | |
| my $needle = $fields[$nkey]; | |
| $needles{ $needle } = 1; | |
| } | |
| close($NDL); | |
| #use Data::Dumper; | |
| #print Dumper \%needles; | |
| # Iterate through haystack, print entries whose keys match a needle | |
| open(my $HAY, "<", $haystackfile) or die("error opening file '$haystackfile'"); | |
| while(my $line = <$HAY>) | |
| { | |
| chomp($line); | |
| my @fields = split(/$hdelim/, $line); | |
| my $keyvalue = $fields[$hkey]; | |
| if($needles{ $keyvalue }) | |
| { | |
| if($outstr eq "-1") | |
| { | |
| print $line, "\n"; | |
| } | |
| else | |
| { | |
| my @values = @fields[@outfields]; | |
| print join("\t", @values), "\n"; | |
| } | |
| } | |
| } | |
| close($HAY); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment