Created
March 18, 2012 05:37
-
-
Save pamag/2069211 to your computer and use it in GitHub Desktop.
converting tped files into HapMap format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
=head1 [progam_name] | |
description: | |
- converts tped files to hapmap files. | |
=head2 usage | |
# convert tped to HapMap | |
perl /soft_bio/PMG/perl/convert_tped_to_hapmap.pl --tped hapmap3_r2_b36_fwd.consensus.qc.poly--100_inds.tped --tfam hapmap3_r2_b36_fwd.consensus.qc.poly--100_inds.tfam --build=ncbi_36 | |
# See documentation | |
perldoc convert_tped_to_hapmap.pl | |
=head2 comments | |
Pedigree files used by plink (.ped) are individual oriented (individuals in rows) but HapMap are SNP oriented (SNPs in rows). For converting from any SNP oriented format from or to plink, the best approach is to use tped (transposed ped) and then use plink to transpose to/from ped. | |
For converting from ped to tped: | |
- Download 100 individuals from hapampIII in ped format | |
wget -qO- ftp://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/2009-01_phaseIII/plink_format/hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2 | bunzip2 -c | head -n 100 > hapmap3_r2_b36_fwd.consensus.qc.poly--100_inds.ped | |
- Download the map file | |
wget -qO- ftp://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/2009-01_phaseIII/plink_format/hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2 | bunzip2 -c > hapmap3_r2_b36_fwd.consensus.qc.poly--100_inds.map | |
# the .map has the '--100_inds' flag only to have the same base name that the .ped file. | |
- convert to tped | |
plink --noweb --file hapmap3_r2_b36_fwd.consensus.qc.poly--100_inds --recode --transpose --out hapmap3_r2_b36_fwd.consensus.qc.poly--100_inds | |
- convert tped to hapmap | |
perl /soft_bio/PMG/perl/convert_tped_to_hapmap.pl --tped hapmap3_r2_b36_fwd.consensus.qc.poly--100_inds.tped --tfam hapmap3_r2_b36_fwd.consensus.qc.poly--100_inds.tfam --build=ncbi_36 | |
=head2 file type description | |
tped files: | |
- 4 columns (like map) | |
- 5th until end => genotypes (two columns per ind). unknown = 0 or N | |
tfam file | |
Family ID | |
Individual ID | |
Paternal ID | |
Maternal ID | |
Sex (1=male; 2=female; other=unknown) | |
Phenotype | |
map | |
- 4 columns | |
- chr | |
- snp_name | |
- cM | |
- chr_pos | |
hapmap_files | |
-HapMap file format: | |
The current release consists of text-table files only, with the following columns: | |
Col1: refSNP rs# identifier at the time of release (NB: it might merge | |
with another rs# in the future) | |
Col2: SNP alleles according to dbSNP | |
Col3: chromosome that SNP maps to | |
Col4: chromosome position of SNP, in basepairs on reference sequence | |
Col5: strand of reference sequence that SNP maps to | |
Col6: version of reference sequence assembly (currently NCBI build36) | |
Col7: HapMap genotyping center that produced the genotypes | |
Col8: LSID for HapMap protocol used for genotyping | |
Col9: LSID for HapMap assay used for genotyping | |
Col10: LSID for panel of individuals genotyped | |
Col11: QC-code, currently 'QC+' for all entries (for future use) | |
Col12 and on: observed genotypes of samples, one per column, sample | |
identifiers in column headers (Coriell catalog numbers, example: | |
NA10847). | |
=cut | |
#use feature ':5.10'; | |
use strict; | |
#use warnings; | |
#use Data::Dumper; | |
use Getopt::Long; | |
my $prog = $0; | |
my $usage = <<EOQ; | |
Usage for $0: | |
>$prog [-test -help -verbose] --tped xx --tfam yy --hapmap zz --build=ncbi_36 | |
optional | |
-------- | |
--panel=ncbi_36 | |
--strand_all_snps=+ | |
EOQ | |
my $help; | |
my $test; | |
my $debug; | |
my $verbose =1; | |
my $bsub; | |
my $log; | |
my $stdout; | |
my $file_tped; | |
my $file_tfam; | |
my $file_hapmap; | |
my $file_pheno; | |
my $file_map; | |
my $build; | |
my $strand_all_snps; | |
my $panel; | |
my $ok = GetOptions( | |
'test' => \$test, | |
'debug:i' => \$debug, | |
'verbose:i' => \$verbose, | |
'h|help' => \$help, | |
'log' => \$log, | |
'bsub' => \$bsub, | |
'stdout' => \$stdout, | |
'tped=s' => \$file_tped, | |
'tfam=s' => \$file_tfam, | |
'hapmap=s' => \$file_hapmap, | |
'pheno=s' => \$file_pheno, | |
'map=s' => \$file_map, | |
'strand_all_snps=s' => \$strand_all_snps, | |
'build=s' => \$build, | |
'panel=s' => \$panel, | |
); | |
if ($help || !$ok ) { | |
print $usage; | |
exit; | |
} | |
unless ($build) { | |
print "#[ERROR] Sorry but you need to document your hapmap genotypes with a genome build\n"; | |
print $usage; | |
exit; | |
} | |
if (! $file_tped || ! $file_tfam ) { | |
print "#[ERROR] Sorry but you need a tped and tfam file\n"; | |
print $usage; | |
exit; | |
} | |
$file_hapmap ||= $file_tped.'.hapmap'; | |
my $log_file = $file_hapmap.'.log'; | |
# variable to store the log | |
my $log_str; | |
start_log("conversion ped to hapmap"); | |
main(); | |
end_log(); | |
sub main { | |
# read each column from tped and convert to hapmap | |
$DB::single=1; | |
feedback_log("#[MSG] loading tfam file '$file_tfam'\n"); | |
my $tfam = load_tfam($file_tfam); | |
$DB::single=1; | |
open (my $tped_fh, '<', $file_tped) or die "Sorry unable to open tped file '$file_tped' for reading:$!\n"; | |
open (my $hapmap_fh, '>', $file_hapmap) or die "Sorry unable to open hapmap file '$file_hapmap' for writing:$!\n"; | |
# print the header of hapmap | |
$DB::single=1; | |
# expect ind ids to be unique, if you don't whant the id merged with family id | |
# just in case I am joining FID__IID iwth double underscore so it can be easily splitted | |
my $inds = [map {$_->[0].'__'.$_->[1]} @$tfam]; # <TODO> CHECK | |
print {$hapmap_fh} join("\t",qw(rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode), @$inds),"\n"; | |
feedback_log("#[MSG] processing tped file '$file_tped'.\n"); | |
while (my $line=<$tped_fh>) { | |
chomp $line; | |
# print a '.' every 500 SNPs | |
print {*STDERR} '.' unless ($. % 500); | |
my @items = split /\s+/, $line; | |
my $dat= { | |
'01_name' => $items[1], | |
'02_alleles' => get_alleles(\@items), | |
'03_chr' => $items[0], | |
'04_pos' => $items[3], | |
'05_strand' => $strand_all_snps? $strand_all_snps : '.', | |
'06_build' => $build, | |
'07_geno_center' => '-', | |
'08_protocol' => '-', | |
'09_assay' => '-', | |
'10_panel' => $panel? $panel:'-', | |
'11_qc_code' => 'QC+', | |
'12_geno_str' => get_geno_string(\@items, $inds), | |
}; | |
$DB::single=1; | |
print {$hapmap_fh} join ("\t" , map{$dat->{$_}} sort keys %$dat). "\n"; | |
} | |
} | |
=head2 get_geno_string | |
Title : get_geno_string | |
Usage : | |
Function: | |
Example : | |
Returns : | |
Args : | |
=cut | |
sub get_geno_string{ | |
my ($items, $inds) = @_; | |
$DB::single=1; | |
my @genos = @$items[4..(@$items-1)]; | |
if (@genos % 2 ) { | |
# more columns that expected | |
die "#[ERROR] columns not multiplo of 2 in parsed ped file ' $file_tped' at line $."; | |
} | |
if (@genos/2 != @$inds) { | |
# error more geno than inds | |
die "#[ERROR] different number of individuals and genotypes in parsed ped file ' $file_tped' at line $.: genos=".(@genos/2)." inds=".@$inds."\n"; | |
} | |
return join ("\t", map { | |
my $idx=$_*2; | |
$genos[$idx+0].$genos[$idx+1] | |
} 0..((@genos/2)-1) | |
) | |
} | |
=head2 get_alleles | |
Title : get_alleles | |
Usage : | |
Function: | |
Example : | |
Returns : | |
Args : | |
=cut | |
sub get_alleles{ | |
my ($items) = @_; | |
my %h; | |
map{$h{$_}++} @$items[4..(@$items-1)]; | |
return join('/',sort keys %h); | |
} | |
=head2 write_log | |
Title : write_log | |
Usage : | |
Function: | |
Example : | |
Returns : | |
Args : $log_str str with the message to log | |
=cut | |
sub write_log{ | |
my ($log_str) = @_; | |
write_file($log_file, $log_str); | |
} | |
=head2 feedback_log | |
Title : feedback_log | |
Usage : | |
Function: print to stderr and add tothe log_str the message. (The str should contain the \n if needed) | |
Example : | |
Returns : | |
Args : $str = string | |
=cut | |
sub feedback_log{ | |
my ($str) = @_; | |
print {*STDERR} $str; | |
$log_str .= $str; | |
} | |
=head2 start_log | |
Title : start_log | |
Usage : | |
Function: start log file with the name of the files | |
Example : | |
Returns : | |
Args : an extra message to add to the log starting | |
=cut | |
sub start_log{ | |
my ($title, $extra_msg) = @_; | |
my $date = localtime; | |
$title ||= "$0"; | |
my $out = "== LOG $title ==\n"; | |
$out .= 'Started at: ' . $date . "\n"; | |
$out .= " hapmap file : " . $file_hapmap . "\n"; | |
$out .= " tped file : " . $file_tped . "\n"; | |
$out .= " tfam file : " . $file_tfam . "\n"; | |
$out .= " map file : " . $file_map . "\n"; | |
$out .= " pheno file : " . $file_pheno . "\n"; | |
$out .= "\n"; | |
$out .= $extra_msg; | |
feedback_log($out); | |
} | |
=head2 end_log | |
Title : end_log | |
Usage : | |
Function: | |
Example : | |
Returns : | |
Args : | |
=cut | |
sub end_log{ | |
my ($extra_msg) = @_; | |
my $date = localtime; | |
my $out = 'ENDED at: ' . $date . "\n"; | |
$out .= $extra_msg; | |
$out .= "== END ==\n"; | |
feedback_log($out); | |
} | |
=head2 load_pheno_file | |
Title : load_pheno_file | |
Usage : | |
Function: | |
Example : | |
Returns : a hash with the IID as key and values. The two first columns should be FID and IID and the third is 'sex' | |
Args : | |
=cut | |
sub load_pheno_file{ | |
my ($file) = @_; | |
my $pheno_hash = read_table_to_hash($file); | |
return $pheno_hash; | |
} | |
=head2 load_tfam | |
Title : load_tfam | |
Usage : | |
Function: | |
Example : | |
Returns : | |
Args : | |
=cut | |
sub load_tfam{ | |
my ($file_tfam) = @_; | |
my $rows = read_table($file_tfam, 0); | |
return $rows; | |
} | |
=head2 read_table | |
Title : read_table | |
Usage : | |
Function: parse files separated with tabs and bare columns (no quotes nor tabs to scape) | |
Example : | |
Returns : | |
Args : $file | |
$has_header | |
<TO_DO> think if we want to capture headers and return them?? | |
=cut | |
sub read_table{ | |
my ($file, $has_header) = @_; | |
die "#[ERROR] no argument passed to sub read_table\n" unless $file; | |
unless (-e $file) { | |
die "#[ERROR] Sorry but the file '$file' does not exist.\n"; | |
} | |
open (my $filefh, '<',$file) or die "#[ERROR] unable to open file '$file' for reading:$!\n"; | |
my $count; | |
my @rows; | |
while( my $line=<$filefh>) { | |
next if $.==1 && $has_header; | |
chomp $line; | |
$count ++; | |
my @items = split /\s+/, $line; | |
push @rows, [@items]; | |
} | |
feedback_log( "...end. $count lines processed\n"); | |
return \@rows; | |
} | |
=head2 read_table_to_hash | |
Title : read_table_to_hash | |
Usage : | |
Function: creates a hash of hashes. A hash with keys the first column | |
and values being another hash constructed with all columns ad keys | |
taken from the header | |
Example : | |
Returns : | |
Args : | |
=cut | |
sub read_table_to_hash{ | |
my ($file) = @_; | |
# a header is mandatory | |
die "#[ERROR] no argument passed to sub read_table\n" unless $file; | |
unless (-e $file) { | |
die "#[ERROR] Sorry but the file '$file' does not exist.\n"; | |
} | |
open (my $filefh, '<',$file) or die "unable to open $file_map for reading:$!\n"; | |
my @headers; | |
my $count; | |
my %hash; | |
my $hc; | |
while (my $line=<$filefh>) { | |
chomp $line; | |
$count ++; | |
my @items = split "\t", $line; | |
if ($.==1){ | |
@headers = @items; | |
$hc = @headers; | |
} | |
else { | |
my $rc = @items; | |
die "#[ERROR] Sorry but line '$.' has wrong number of columns. header=$hc, this_row=$rc\n" unless $hc==$rc; | |
@{$hash{$items[0]}}{map{lc}@headers}=@items; | |
} | |
} | |
feedback_log( ".. $count lines processed for $file\n"); | |
return \%hash; | |
} | |
=head2 final_qc | |
Title : final_qc | |
Usage : | |
Function: | |
Example : | |
Returns : | |
Args : | |
=cut | |
sub final_qc{ | |
my ($ind_ids, $pheno_hash, $map_hash) = @_; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment