Created
June 15, 2023 09:16
-
-
Save nylander/287d1f47c669a350c2e7b97a3da58df5 to your computer and use it in GitHub Desktop.
Create a sample sheet from NGI (SciLifeLab) sequencing delivery
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
=pod | |
=encoding utf8 | |
=head1 NAME | |
create_sample_sheet.pl - Create sample sheet from NGI delivery | |
=head1 SYNOPSIS | |
$ create_sample_sheet.pl 00-Reports/S.OmeName_22_01_sample_info.txt | |
=head1 DESCRIPTION | |
Change directory to the main delivery directory (e.g. "cd P27213") before
running the script. The script reads the tab-separated sample info file given
as argument (e.g. "00-Reports/*_sample_info.txt"), taking the NGI ID from
column 1 and the user-defined ID from column 2, and then locates the
corresponding .fastq.gz files below the current directory.
Prints tab separated entries to STDOUT. | |
Note: currently assumes paired-end (PE) libraries; samples lacking either an R1 or an R2 file are not listed.
=head1 OPTIONS | |
=over 4 | |
=item B<-a,--atlas> Output table for the ATLAS workflow (see L<https://github.com/metagenome-atlas/atlas>). | |
=item B<-e,--eager> Output table for the nf-core/eager workflow (See L<https://nf-co.re/eager/2.4.7/usage#tsv-input-method>). | |
=item B<-h,--help> Display help. | |
=item B<-v,--version> Display version. | |
=back | |
=head1 AUTHOR | |
Johan Nylander <[email protected]> | |
=head1 VERSION | |
0.2 | |
=head1 COPYRIGHT AND LICENSE | |
Copyright 2023 Johan Nylander. | |
Distributed under the MIT License. | |
=cut | |
use strict; | |
use warnings; | |
use File::Find; | |
use File::Basename; | |
use Cwd; | |
use Data::Dumper; | |
use Getopt::Long; | |
Getopt::Long::Configure("no_ignore_case", "no_auto_abbrev"); | |
# Script version, printed by -v/--version.
my $version = '0.2';

# Constant column values written to every row of the nf-core/eager table.
# For the meaning of each column, see
# https://nf-co.re/eager/2.4.7/usage#tsv-input-method
my (
    $eager_colour_chemistry,
    $eager_seqtype,
    $eager_organism,
    $eager_strandedness,
    $eager_udg_treatment,
    $eager_bam,
) = ('2', 'PE', 'NA', 'double', 'full', 'NA');

# Output format switches, set by -a/--atlas and -e/--eager.
my ($atlas, $eager) = ('', '');

# Per-sample records keyed by NGI ID; each record holds the keys
# 'uid', 'ngiid' and, once the fastq files are located, 'R1' and 'R2'.
my %HoH;

# User IDs and NGI IDs in the order they appear in the sample info file,
# and the paths of all *.fastq.gz files found.
my (@uid_order, @ngiid_order, @file_paths);

# Directory the script was started from; root of the fastq file search.
my $cwd = getcwd();
# One-line usage message shared by -h, the bad-option error and the
# no-argument case.
my $usage = "Usage: $0 [OPTIONS][--help] sample_info_file\n";

# Parse command line switches. -h prints the short usage line, whereas
# --help hands over to perldoc to display the full POD of this script.
my $options_ok = GetOptions(
    'a|atlas'   => \$atlas,
    'e|eager'   => \$eager,
    'v|version' => sub { print "$version\n"; exit(0); },
    'h'         => sub { print $usage; exit(0); },
    'help'      => sub { exec("perldoc", $0); exit(0); },
);
if (!$options_ok) {
    die("$0 Error in command line arguments\n$usage");
}

# Started interactively without any argument: show usage and exit non-zero.
if (!@ARGV) {
    if (-t STDIN && -t STDERR) {
        print $usage;
        exit(1);
    }
}

# First remaining argument is the sample info file.
my $sample_info_file = shift @ARGV
    or die "Error: need sample file as argument\n";
# Callback for File::Find::find(): records the full path of every regular
# file whose name ends in ".fastq.gz" in the file-level @file_paths array.
# File::Find sets $_ to the current file name and $File::Find::name to its
# complete path.
sub find_files {
    my $is_fastq = -f $_ && /^.*\.fastq\.gz\z/s;
    return if !$is_fastq;
    push @file_paths, $File::Find::name;
}
# Read the tab-separated sample info file: column 1 is the NGI ID, column 2
# the user-defined ID. Fills %HoH (keyed by NGI ID) and remembers the order
# of appearance in @ngiid_order / @uid_order.
open(my $SAMPLE, "<", $sample_info_file)
    or die "Error: could not open sample file '$sample_info_file': $!\n";
while (my $line = <$SAMPLE>) {
    chomp $line;
    $line =~ s/\r\z//;              # tolerate CRLF line endings
    next if $line =~ /^NGI ID/;     # header line
    next if $line !~ /\S/;          # blank line: nothing to record
    my ($ngiid, $userid) = split /\t/, $line;
    # A row without an NGI ID cannot be matched to any file; skip it
    # instead of storing an empty hash key.
    next if !defined $ngiid || $ngiid eq '';
    # Keep rows lacking a user ID, but store an empty string rather than
    # undef so later printing does not warn.
    $userid = '' if !defined $userid;
    push(@uid_order, $userid);
    push(@ngiid_order, $ngiid);
    $HoH{$ngiid}{'uid'} = $userid;
    $HoH{$ngiid}{'ngiid'} = $ngiid;
}
close($SAMPLE);
# Collect all *.fastq.gz files below the start directory into @file_paths.
find(\&find_files, $cwd);

# Assign each R1/R2 file to the sample(s) whose NGI ID occurs in its path.
# If several files match the same ID and read, the one visited last wins.
for my $file_path (@file_paths) {
    my $read;
    if ($file_path =~ /R1_001\.fastq\.gz/) {
        $read = 'R1';
    }
    elsif ($file_path =~ /R2_001\.fastq\.gz/) {
        $read = 'R2';
    }
    next if !defined $read;
    for my $id (@ngiid_order) {
        # \Q..\E: treat the ID as literal text, not as a regex.
        # (?!\d): do not let e.g. "P1_1" match inside "P1_11".
        next if $file_path !~ /\Q$id\E(?!\d)/;
        $HoH{$id}{$read} = $file_path;
    }
}

# Warn once per sample for which no complete R1/R2 pair was found (rather
# than once for every non-matching file/ID combination).
for my $id (@ngiid_order) {
    next if exists $HoH{$id}{'R1'} && exists $HoH{$id}{'R2'};
    print STDERR "Warning: could not find fastq.gz file for ID $id\n";
}
# Print the sample sheet to STDOUT in the requested format. In all three
# formats, samples lacking either the R1 or the R2 file are left out.
if ($eager) {
    # nf-core/eager TSV, one row per sample.
    # See https://nf-co.re/eager/2.4.7/usage#tsv-input-method
    print STDOUT "Sample_Name\tLibrary_ID\tLane\tColour_Chemistry\tSeqType\tOrganism\tStrandedness\tUDG_Treatment\tR1\tR2\tBAM\n";
    for my $id (@ngiid_order) {
        next unless exists $HoH{$id}{'R1'} && exists $HoH{$id}{'R2'};
        # Lane number is taken from the "_L<digits>" part that ends the R1
        # file name once the "_R1_001.fastq.gz" suffix is removed.
        my ($filename) = fileparse($HoH{$id}{'R1'}, '_R1_001.fastq.gz');
        my $lane = '';
        if ($filename =~ /.+_L(\d+)$/) {
            $lane = $1;
        }
        print STDOUT join("\t",
            $HoH{$id}{'uid'},
            $HoH{$id}{'ngiid'},
            $lane,
            $eager_colour_chemistry,
            $eager_seqtype,
            $eager_organism,
            $eager_strandedness,
            $eager_udg_treatment,
            $HoH{$id}{'R1'},
            $HoH{$id}{'R2'},
            $eager_bam,
        ), "\n";
    }
}
elsif ($atlas) {
    # ATLAS table, two rows per sample (R1 then R2).
    print STDOUT "Sample\tLib\tFile\tRead\tPath\n";
    for my $id (@ngiid_order) {
        # Same guard as the other formats: without it a sample with no
        # files would be passed to fileparse() as undef.
        next unless exists $HoH{$id}{'R1'} && exists $HoH{$id}{'R2'};
        for my $read ('R1', 'R2') {
            my ($filename, $dirs) = fileparse($HoH{$id}{$read}, "_${read}_001.fastq.gz");
            # "Lib" is the name of the directory holding the file.
            my @d = split /\//, $dirs;
            my $lib = pop(@d);
            print STDOUT join("\t", $HoH{$id}{'uid'}, $lib, $filename, $read, $dirs), "\n";
        }
    }
}
else {
    # Default table: NGI ID, user ID, R1 path, R2 path.
    for my $id (@ngiid_order) {
        next unless exists $HoH{$id}{'R1'} && exists $HoH{$id}{'R2'};
        print join("\t", $HoH{$id}{'ngiid'}, $HoH{$id}{'uid'}, $HoH{$id}{'R1'}, $HoH{$id}{'R2'}), "\n";
    }
}
__END__ | |
# %HoH = ( | |
# 'P27213_116' => { | |
# 'uid' => 'PS-B6', | |
# 'ngiid' => 'P27213_116', | |
# 'R1' => '/proj/nrmdnalab_storage/Pleurozium_NV/ngisthlm00062/files/P27213/P27213_116/02-FASTQ/221014_A01901_0056_AHNLGHDSX3/P27213_116_S68_L002_R1_001.fastq.gz', | |
# 'R2' => '/proj/nrmdnalab_storage/Pleurozium_NV/ngisthlm00062/files/P27213/P27213_116/02-FASTQ/221014_A01901_0056_AHNLGHDSX3/P27213_116_S68_L002_R2_001.fastq.gz', | |
# }, | |
# ); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment