Created
August 27, 2013 14:26
-
-
Save avrilcoghlan/6354213 to your computer and use it in GitHub Desktop.
Perl script that, given an input gff file, identifies exons that have a 0 bp intron between them, or are overlapping exons, and merges them.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
=head1 NAME | |
merge_overlapping_exons.pl | |
=head1 SYNOPSIS | |
merge_overlapping_exons.pl input_gff output_gff outputdir input_fasta | |
where input_gff is the input gff file, | |
output_gff is the output gff file, | |
outputdir is the directory to write output files in, | |
input_fasta is the input fasta file for the assembly. | |
=head1 DESCRIPTION | |
This script takes an input gff file, and identifies exons that have a 0 bp | |
intron between them (ie. are adjacent exons) or are overlapping exons, and merges | |
them. The corrected exons are written to an output gff file. | |
=head1 VERSION | |
Perl script last edited 22-Jul-2013. | |
=head1 CONTACT | |
[email protected] (Avril Coghlan) | |
=cut | |
# | |
# Perl script merge_overlapping_exons.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 22-Jul-13. | |
# Last edited 22-Jul-2013. | |
# SCRIPT SYNOPSIS: merge_overlapping_exons.pl: given an input gff file, identifies exons that have a 0 bp intron between them, or are overlapping exons, and merges them. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
use strict; | |
use warnings; | |
# xxx | |
BEGIN { | |
unshift (@INC, '/nfs/users/nfs_a/alc/Documents/git/helminth_scripts/modules'); | |
} | |
use HelminthGenomeAnalysis::AvrilGffUtils; | |
use HelminthGenomeAnalysis::AvrilFileUtils; | |
use HelminthGenomeAnalysis::AvrilFastaUtils; | |
my $num_args = $#ARGV + 1; | |
if ($num_args != 4) | |
{ | |
print "Usage of merge_overlapping_exons.pl\n\n"; | |
print "perl merge_overlapping_exons.pl <input_gff> <output_gff> <outputdir> <input_fasta_file>\n"; | |
print "where <input_gff> is the input gff file,\n"; | |
print " <output_gff> is the output gff file,\n"; | |
print " <outputdir> is the directory to write output files in,\n"; | |
print " <input_fasta_file> is the input fasta file for the assembly\n"; | |
print "For example, >perl merge_overlapping_exons.pl final.gff final2.gff . TRIC.v1.fa\n"; | |
exit; | |
} | |
# FIND THE PATH TO THE INPUT GFF FILE: | |
my $input_gff = $ARGV[0]; | |
# FIND THE NAME OF THE OUTPUT GFF FILE: | |
my $output_gff = $ARGV[1]; | |
# FIND THE NAME OF THE DIRECTORY TO WRITE OUTPUT FILES IN: | |
my $outputdir = $ARGV[2]; | |
# FIND THE NAME OF THE INPUT FASTA FILE FOR THE ASSEMBLY: | |
my $input_fasta = $ARGV[3]; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
&run_main_program($input_gff,$output_gff,$outputdir,$input_fasta); | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
sub run_main_program | |
{ | |
my $input_gff = $_[0]; # THE INPUT GFF FILE | |
my $output_gff = $_[1]; # THE OUTPUT GFF FILE | |
my $outputdir = $_[2]; # THE DIRECTORY TO WRITE OUTPUT FILES IN | |
my $input_fasta = $_[3]; # THE INPUT ASSEMBLY FASTA FILE | |
my $input_fasta_obj = $_[4]; # OBJECT FOR THE INPUT FASTA FILE | |
my $contigs2len; # HASH TABLE OF THE LENGTHS OF SCAFFOLD/CONTIG SEQUENCES | |
my $contigslen_file; # FILE WITH THE LENGTHS OF SCAFFOLD/CONTIG SEQUENCES | |
my $contig; # A CONTIG/SCAFFOLD NAME | |
my $cds_gff; # GFF FILE OF 'CDS' FEATURES | |
my $feature_types; # FEATURE TYPES TO PUT IN $cds_gff | |
my $returnvalue; # RETURN VALUE FROM A FUNCTION | |
my $sorted_cds_gff; # A SORTED VERSION OF $cds_gff | |
my $exon_gff; # GFF FILE OF 'exon' FEATURES | |
my $sorted_exon_gff; # A SORTED VERSION OF $exon_gff | |
my $contiglen; # LENGTH OF A SCAFFOLD | |
# GET THE LENGTHS OF THE SEQUENCES IN THE INPUT FASTA FILE, AND WRITE THEM OUT IN A TEMPORARY FILE: | |
$input_fasta_obj = HelminthGenomeAnalysis::AvrilFastaUtils->new(fasta_file => $input_fasta); | |
$contigs2len = HelminthGenomeAnalysis::AvrilFastaUtils::build_contigs2len($input_fasta_obj); | |
$contigslen_file = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); # MAKE A TEMPORARY FILE IN THE CURRENT DIRECTORY | |
open(CONTIGSLENFILE,">$contigslen_file") || die "ERROR: run_main_program: cannot open $contigslen_file\n"; | |
foreach my $contig (keys %{$contigs2len}) | |
{ | |
$contiglen = $contigs2len->{$contig}; | |
# ADD ONE TO THE SCAFFOLD LENGTH, SO THAT add_flanking_region_to_gff_features FUNCTION WILL WORK, EVEN IF THE END OF AN EXON/CDS FEATURE | |
# IS THE LAST BASE OF THE SCAFFOLD: | |
$contiglen = $contiglen + 1; | |
print CONTIGSLENFILE "$contig\t$contiglen\n"; | |
} | |
close(CONTIGSLENFILE); | |
# GET A GFF FILE OF 'CDS' FEATURES: | |
$cds_gff = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); # MAKE A TEMPORARY FILE IN THE CURRENT DIRECTORY | |
$feature_types = ['CDS']; | |
$returnvalue = HelminthGenomeAnalysis::AvrilGffUtils::get_gff_lines_for_feature_types($input_gff,$feature_types,$cds_gff); | |
# SORT $cds_gff: | |
$sorted_cds_gff = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); | |
$returnvalue = HelminthGenomeAnalysis::AvrilGffUtils::sort_gff($cds_gff,$sorted_cds_gff,'CDS'); | |
# GET A GFF FILE OF 'exon' FEATURES: | |
$exon_gff = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); # MAKE A TEMPORARY FILE IN THE CURRENT DIRECTORY | |
$feature_types = ['exon']; | |
$returnvalue = HelminthGenomeAnalysis::AvrilGffUtils::get_gff_lines_for_feature_types($input_gff,$feature_types,$exon_gff); | |
# SORT $exon_gff: | |
$sorted_exon_gff = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); | |
$returnvalue = HelminthGenomeAnalysis::AvrilGffUtils::sort_gff($exon_gff,$sorted_exon_gff,'exon'); | |
# MERGE OVERLAPPING CDS FEATURES, OR FEATURES THAT HAVE A 0 bp INTRON BETWEEN THEM: | |
$returnvalue = HelminthGenomeAnalysis::AvrilGffUtils::merge_overlapping_cds($sorted_cds_gff,$sorted_exon_gff,$input_gff,$output_gff,$outputdir,$contigslen_file); | |
# DELETE TEMPORARY FILES: | |
system "rm -f $exon_gff"; | |
system "rm -f $cds_gff"; | |
system "rm -f $sorted_cds_gff"; | |
system "rm -f $sorted_exon_gff"; | |
system "rm -f $contigslen_file"; | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment