Created
February 12, 2009 17:50
-
-
Save brentp/62770 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/src/extended/gff3_out_stream.c b/src/extended/gff3_out_stream.c | |
index 6dbbc23..f813e8f 100644 | |
--- a/src/extended/gff3_out_stream.c | |
+++ b/src/extended/gff3_out_stream.c | |
@@ -18,6 +18,7 @@ | |
#include "extended/gff3_out_stream.h" | |
#include "extended/gff3_visitor.h" | |
#include "extended/node_stream_rep.h" | |
+#include "core/cstr_table.h" | |
struct GtGFF3OutStream { | |
const GtNodeStream parent_instance; | |
@@ -76,3 +77,11 @@ void gt_gff3_out_stream_set_fasta_width(GtNodeStream *gs, | |
gt_assert(gff3_out_stream); | |
gt_gff3_visitor_set_fasta_width(gff3_out_stream->gff3_visitor, fasta_width); | |
} | |
+ | |
+/* Forward the retain-IDs request to the GFF3 visitor owned by the GFF3 | |
+   output stream <gs>. */ | |
+void gt_gff3_out_stream_retain_id_attributes(GtNodeStream *gs) | |
+{ | |
+  GtGFF3OutStream *gff3_out_stream; | |
+  gff3_out_stream = gff3_out_stream_cast(gs); | |
+  gt_assert(gff3_out_stream); | |
+  gt_gff3_visitor_retain_id_attributes(gff3_out_stream->gff3_visitor); | |
+} | |
diff --git a/src/extended/gff3_out_stream.h b/src/extended/gff3_out_stream.h | |
index adc3eba..e004c5d 100644 | |
--- a/src/extended/gff3_out_stream.h | |
+++ b/src/extended/gff3_out_stream.h | |
@@ -28,5 +28,7 @@ const GtNodeStreamClass* gt_gff3_out_stream_class(void); | |
GtNodeStream* gt_gff3_out_stream_new(GtNodeStream*, GtGenFile*); | |
void gt_gff3_out_stream_set_fasta_width(GtNodeStream*, | |
unsigned long); | |
+void gt_gff3_out_stream_retain_id_attributes( | |
+ GtNodeStream *); | |
#endif | |
diff --git a/src/extended/gff3_visitor.c b/src/extended/gff3_visitor.c | |
index 8246680..1a26b72 100644 | |
--- a/src/extended/gff3_visitor.c | |
+++ b/src/extended/gff3_visitor.c | |
@@ -23,21 +23,27 @@ | |
#include "core/ma.h" | |
#include "core/unused_api.h" | |
#include "core/string_distri.h" | |
+#include "core/cstr_table.h" | |
+#include "core/str_api.h" | |
+#include "core/warning_api.h" | |
#include "extended/genome_node.h" | |
#include "extended/gff3_output.h" | |
#include "extended/gff3_parser.h" | |
#include "extended/gff3_visitor.h" | |
#include "extended/node_visitor_rep.h" | |
+#include <stdbool.h> | |
struct GtGFF3Visitor { | |
const GtNodeVisitor parent_instance; | |
bool version_string_shown, | |
+ retain_ids, /* if true, store_ids() reuses a feature's "ID" attribute */ | |
fasta_directive_shown; | |
GtStringDistri *id_counter; | |
GtHashmap *gt_feature_node_to_id_array, | |
*gt_feature_node_to_unique_id_str; | |
unsigned long fasta_width; | |
GtGenFile *outfp; | |
+ GtCstrTable *gt_used_ids; /* IDs already handed out by make_id_unique() */ | |
}; | |
typedef struct { | |
@@ -71,6 +77,7 @@ static void gff3_visitor_free(GtNodeVisitor *gv) | |
gt_string_distri_delete(gff3_visitor->id_counter); | |
gt_hashmap_delete(gff3_visitor->gt_feature_node_to_id_array); | |
gt_hashmap_delete(gff3_visitor->gt_feature_node_to_unique_id_str); | |
+ gt_cstr_table_delete(gff3_visitor->gt_used_ids); | |
} | |
static int gff3_visitor_comment_node(GtNodeVisitor *gv, GtCommentNode *cn, | |
@@ -190,18 +197,66 @@ static GtStr* create_unique_id(GtGFF3Visitor *gff3_visitor, GtFeatureNode *gf) | |
return id; | |
} | |
+/* Append a '.' followed by <counter> to <current_id> in place, | |
+   e.g. "name" with counter 1 becomes "name.1". */ | |
+static void make_unique_id_string(GtStr *current_id, unsigned long counter) | |
+{ | |
+  gt_str_append_char(current_id, '.'); | |
+  gt_str_append_ulong(current_id, counter); | |
+} | |
+ | |
+/* Build the candidate "<id>.<i>" in the scratch string <buf> (its previous | |
+   content is discarded) and return true if <tab> has no entry for it. | |
+   On return <buf> holds the candidate that was probed. */ | |
+static bool id_string_is_unique(GtStr *id, GtStr *buf, GtCstrTable *tab, | |
+                                unsigned long i) | |
+{ | |
+  const char *candidate; | |
+  gt_str_reset(buf); | |
+  gt_str_append_str(buf, id); | |
+  make_unique_id_string(buf, i); | |
+  candidate = gt_str_get(buf); | |
+  return !gt_cstr_table_get(tab, candidate); | |
+} | |
+/* Ensure <id> does not clash with an ID recorded in the visitor's table of | |
+   used IDs. If <id> is already present, it is rewritten in place to the | |
+   first "<id>.<n>" (n = 1, 2, ...) that is not in the table and a warning | |
+   is issued. The resulting ID is then recorded in the table. */ | |
+static void make_id_unique(GtGFF3Visitor *gff3_visitor, GtStr *id) | |
+{ | |
+  GtCstrTable *used_ids = gff3_visitor->gt_used_ids; | |
+ | |
+  if (gt_cstr_table_get(used_ids, gt_str_get(id)) != NULL) { | |
+    unsigned long suffix; | |
+    GtStr *candidate = gt_str_new(); | |
+    /* probe "<id>.1", "<id>.2", ... until an unused candidate is found */ | |
+    for (suffix = 1; | |
+         !id_string_is_unique(id, candidate, used_ids, suffix); | |
+         suffix++) | |
+      ; | |
+    gt_warning("feature ID \"%s\" not unique: changing to %s", gt_str_get(id), | |
+               gt_str_get(candidate)); | |
+    /* overwrite the caller's ID with the unused candidate */ | |
+    gt_str_set(id, gt_str_get(candidate)); | |
+    gt_str_delete(candidate); | |
+  } | |
+  /* record the (possibly rewritten) ID as used */ | |
+  gt_cstr_table_add(used_ids, gt_str_get(id)); | |
+} | |
+ | |
static int store_ids(GtGenomeNode *gn, void *data, GtError *err) | |
{ | |
GtGFF3Visitor *gff3_visitor = (GtGFF3Visitor*) data; | |
GtFeatureNode *gf = (GtFeatureNode*) gn; | |
AddIDInfo add_id_info; | |
int had_err = 0; | |
- GtStr *id; | |
+ bool has_id = false; | |
+ const char *id_string = gt_feature_node_get_attribute(gf, "ID"); | |
+ GtStr *id; /* = gt_str_new_cstr(id_string); */ | |
+ bool retain_ids = gff3_visitor->retain_ids; | |
gt_error_check(err); | |
gt_assert(gn && gf && gff3_visitor); | |
- if (gt_genome_node_has_children(gn) || gt_feature_node_is_multi(gf)) { | |
+ if (retain_ids && id_string) { | |
+ id = gt_str_new_cstr(id_string); | |
+ if (!gt_feature_node_is_multi(gf) || | |
+ (gt_feature_node_is_multi(gf) | |
+ && gt_feature_node_get_multi_representative(gf) == gf)) { | |
+ make_id_unique(gff3_visitor, id); | |
+ } | |
+ gt_hashmap_add(gff3_visitor->gt_feature_node_to_unique_id_str, gf, id); | |
+ has_id = true; | |
+ } | |
+ else if (gt_genome_node_has_children(gn) || gt_feature_node_is_multi(gf)) { | |
if (gt_feature_node_is_multi(gf)) { | |
id = gt_hashmap_get(gff3_visitor->gt_feature_node_to_unique_id_str, | |
gt_feature_node_get_multi_representative(gf)); | |
@@ -209,14 +264,18 @@ static int store_ids(GtGenomeNode *gn, void *data, GtError *err) | |
id = create_unique_id(gff3_visitor, | |
gt_feature_node_get_multi_representative(gf)); | |
} | |
+ | |
if (gt_feature_node_get_multi_representative(gf) != gf) { | |
gt_hashmap_add(gff3_visitor->gt_feature_node_to_unique_id_str, gf, | |
gt_str_ref(id)); | |
} | |
} | |
- else | |
+ else { | |
id = create_unique_id(gff3_visitor, gf); | |
- | |
+ } | |
+ has_id = true; | |
+ } | |
+ if (has_id) { | |
/* for each child -> store the parent feature in the hash map */ | |
add_id_info.gt_feature_node_to_id_array = | |
gff3_visitor->gt_feature_node_to_id_array, | |
@@ -224,6 +283,7 @@ static int store_ids(GtGenomeNode *gn, void *data, GtError *err) | |
had_err = gt_genome_node_traverse_direct_children(gn, &add_id_info, add_id, | |
err); | |
} | |
+ /* gt_str_delete(id); */ | |
return had_err; | |
} | |
@@ -329,9 +389,18 @@ GtNodeVisitor* gt_gff3_visitor_new(GtGenFile *outfp) | |
HASH_DIRECT, NULL, (GtFree) gt_str_delete); | |
gff3_visitor->fasta_width = 0; | |
gff3_visitor->outfp = outfp; | |
+ /* gt_used_ids is only used when retain_ids is set to true */ | |
+ gff3_visitor->gt_used_ids = gt_cstr_table_new(); | |
+ gff3_visitor->retain_ids = false; | |
return gv; | |
} | |
+/* Set the retain_ids flag of the GFF3 visitor <gv>; store_ids() then reuses | |
+   the "ID" attribute of a feature node when one is present. */ | |
+void gt_gff3_visitor_retain_id_attributes(GtNodeVisitor *gv) | |
+{ | |
+  GtGFF3Visitor *visitor; | |
+  visitor = gff3_visitor_cast(gv); | |
+  visitor->retain_ids = true; | |
+} | |
+ | |
void gt_gff3_visitor_set_fasta_width(GtNodeVisitor *gv, | |
unsigned long fasta_width) | |
{ | |
diff --git a/src/extended/gff3_visitor.h b/src/extended/gff3_visitor.h | |
index 1939565..530371c 100644 | |
--- a/src/extended/gff3_visitor.h | |
+++ b/src/extended/gff3_visitor.h | |
@@ -27,5 +27,6 @@ const GtNodeVisitorClass* gt_gff3_visitor_class(void); | |
GtNodeVisitor* gt_gff3_visitor_new(GtGenFile*); | |
void gt_gff3_visitor_set_fasta_width(GtNodeVisitor*, | |
unsigned long); | |
+void gt_gff3_visitor_retain_id_attributes(GtNodeVisitor *); | |
#endif | |
diff --git a/src/tools/gt_gff3.c b/src/tools/gt_gff3.c | |
index 02e4e35..cc8f7ff 100644 | |
--- a/src/tools/gt_gff3.c | |
+++ b/src/tools/gt_gff3.c | |
@@ -35,6 +35,7 @@ | |
typedef struct { | |
bool sort, | |
checkids, | |
+ retainids, | |
mergefeat, | |
addintrons, | |
verbose, | |
@@ -91,6 +92,14 @@ static GtOptionParser* gt_gff3_option_parser_new(void *tool_arguments) | |
"parsing", &arguments->tidy, false); | |
gt_option_parser_add_option(op, option); | |
+ /* -retainids: reuse the ID attributes found in the input */ | |
+ option = gt_option_new_bool("retainids", | |
+ "when available, use the original IDs provided " | |
+ "in the source file\n" | |
+ "(memory consumption is O(file_size))", | |
+ &arguments->retainids, false); | |
+ gt_option_parser_add_option(op, option); | |
+ | |
/* -checkids */ | |
option = gt_option_new_bool("checkids", | |
"make sure the ID attributes are unique " | |
@@ -187,6 +196,7 @@ static int gt_gff3_runner(int argc, const char **argv, int parsed_args, | |
gt_gff3_in_stream_show_progress_bar((GtGFF3InStream*) gff3_in_stream); | |
if (arguments->checkids) | |
gt_gff3_in_stream_check_id_attributes((GtGFF3InStream*) gff3_in_stream); | |
+ | |
last_stream = gff3_in_stream; | |
/* set different type checker if necessary */ | |
@@ -243,6 +253,9 @@ static int gt_gff3_runner(int argc, const char **argv, int parsed_args, | |
gt_gff3_out_stream_set_fasta_width(gff3_out_stream, arguments->width); | |
} | |
+ if (!had_err && arguments->retainids) | |
+ gt_gff3_out_stream_retain_id_attributes(gff3_out_stream); | |
+ | |
/* pull the features through the stream and free them afterwards */ | |
if (!had_err) | |
had_err = gt_node_stream_pull(gff3_out_stream, err); | |
diff --git a/testdata/multi_feature_simple_retain.gff3 b/testdata/multi_feature_simple_retain.gff3 | |
new file mode 100644 | |
index 0000000..292dfff | |
--- /dev/null | |
+++ b/testdata/multi_feature_simple_retain.gff3 | |
@@ -0,0 +1,6 @@ | |
+##gff-version 3 | |
+##sequence-region ctg123 1 1497228 | |
+ctg123 . gene 1000 9000 . + . ID=gene1 | |
+ctg123 . CDS 1201 1500 . + 0 ID=CDS1;Parent=gene1 | |
+ctg123 . CDS 3000 3902 . + 0 ID=CDS1;Parent=gene1 | |
+### | |
diff --git a/testdata/png_test_2.gff3 b/testdata/png_test_2.gff3 | |
index 5217d18..2aae61d 100644 | |
--- a/testdata/png_test_2.gff3 | |
+++ b/testdata/png_test_2.gff3 | |
@@ -10,3 +10,4 @@ ctg123 . exon 1050 1500 . . . Parent=mRNA00001,mRNA00002 | |
ctg123 . exon 3000 3902 . . . Parent=mRNA00001,mRNA00003 | |
ctg123 . exon 5000 5500 . . . Parent=mRNA00001,mRNA00002,mRNA00003 | |
ctg123 . exon 7000 9000 . . . Parent=mRNA00001,mRNA00002,mRNA00003 | |
+### | |
diff --git a/testdata/png_test_2_out.gff3 b/testdata/png_test_2_out.gff3 | |
new file mode 100644 | |
index 0000000..0b20a6b | |
--- /dev/null | |
+++ b/testdata/png_test_2_out.gff3 | |
@@ -0,0 +1,13 @@ | |
+##gff-version 3 | |
+##sequence-region ctg123 1 10000 | |
+ctg123 . gene 1000 9000 . . . ID=gene00001 | |
+ctg123 . TF_binding_site 1000 1012 . . . Parent=gene00001 | |
+ctg123 . mRNA 1050 9000 . . . ID=mRNA00001;Parent=gene00001 | |
+ctg123 . mRNA 1050 9000 . . . ID=mRNA00002;Parent=gene00001 | |
+ctg123 . mRNA 1300 9000 . . . ID=mRNA00003;Parent=gene00001 | |
+ctg123 . exon 1050 1500 . . . Parent=mRNA00001,mRNA00002 | |
+ctg123 . exon 3000 3902 . . . Parent=mRNA00001,mRNA00003 | |
+ctg123 . exon 5000 5500 . . . Parent=mRNA00001,mRNA00002,mRNA00003 | |
+ctg123 . exon 7000 9000 . . . Parent=mRNA00001,mRNA00002,mRNA00003 | |
+ctg123 . exon 1300 1500 . . . Parent=mRNA00003 | |
+### | |
diff --git a/testsuite/gt_gff3_include.rb b/testsuite/gt_gff3_include.rb | |
index 3525dc2..9faa602 100644 | |
--- a/testsuite/gt_gff3_include.rb | |
+++ b/testsuite/gt_gff3_include.rb | |
@@ -612,6 +612,20 @@ Test do | |
run "diff #{$last_stdout} #{$testdata}two_fasta_seqs.gff3" | |
end | |
+Name "gt gff3 (-retainids)" | |
+Keywords "gt_gff3 retainids" | |
+Test do | |
+ run_test "#{$bin}gt gff3 -retainids #{$testdata}png_test_2.gff3" | |
+ run "diff #{$last_stdout} #{$testdata}png_test_2_out.gff3" | |
+end | |
+ | |
+Name "gt gff3 multi-feature (-retainids)" | |
+Keywords "gt_gff3 multi-feature retainids" | |
+Test do | |
+ run_test "#{$bin}gt gff3 -retainids #{$testdata}multi_feature_simple.gff3" | |
+ run "diff #{$last_stdout} #{$testdata}multi_feature_simple_retain.gff3" | |
+end | |
+ | |
Name "gt gff3 simple multi-feature (round-trip)" | |
Keywords "gt_gff3 multi-feature" | |
Test do |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment