Created
February 10, 2009 05:18
-
-
Save brentp/61250 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/src/extended/gff3_out_stream.c b/src/extended/gff3_out_stream.c | |
index 6dbbc23..f813e8f 100644 | |
--- a/src/extended/gff3_out_stream.c | |
+++ b/src/extended/gff3_out_stream.c | |
@@ -18,6 +18,7 @@ | |
#include "extended/gff3_out_stream.h" | |
#include "extended/gff3_visitor.h" | |
#include "extended/node_stream_rep.h" | |
+#include "core/cstr_table.h" | |
struct GtGFF3OutStream { | |
const GtNodeStream parent_instance; | |
@@ -76,3 +77,11 @@ void gt_gff3_out_stream_set_fasta_width(GtNodeStream *gs, | |
gt_assert(gff3_out_stream); | |
gt_gff3_visitor_set_fasta_width(gff3_out_stream->gff3_visitor, fasta_width); | |
} | |
+ | |
+/* Switch the GFF3 visitor owned by the output stream <gs> into the mode in | |
+   which it retains ID attributes. <gs> must be a GFF3 output stream. */ | |
+void gt_gff3_out_stream_retain_id_attributes(GtNodeStream *gs) | |
+{ | |
+  GtGFF3OutStream *out_stream; | |
+ | |
+  out_stream = gff3_out_stream_cast(gs); | |
+  gt_assert(out_stream); | |
+  /* the stream only forwards the request to its visitor */ | |
+  gt_gff3_visitor_retain_id_attributes(out_stream->gff3_visitor); | |
+} | |
diff --git a/src/extended/gff3_out_stream.h b/src/extended/gff3_out_stream.h | |
index adc3eba..e004c5d 100644 | |
--- a/src/extended/gff3_out_stream.h | |
+++ b/src/extended/gff3_out_stream.h | |
@@ -28,5 +28,7 @@ const GtNodeStreamClass* gt_gff3_out_stream_class(void); | |
GtNodeStream* gt_gff3_out_stream_new(GtNodeStream*, GtGenFile*); | |
void gt_gff3_out_stream_set_fasta_width(GtNodeStream*, | |
unsigned long); | |
+void gt_gff3_out_stream_retain_id_attributes( | |
+ GtNodeStream *); | |
#endif | |
diff --git a/src/extended/gff3_visitor.c b/src/extended/gff3_visitor.c | |
index 8246680..02432f7 100644 | |
--- a/src/extended/gff3_visitor.c | |
+++ b/src/extended/gff3_visitor.c | |
@@ -23,21 +23,26 @@ | |
#include "core/ma.h" | |
#include "core/unused_api.h" | |
#include "core/string_distri.h" | |
+#include "core/cstr_table.h" | |
+#include "core/str_api.h" | |
#include "extended/genome_node.h" | |
#include "extended/gff3_output.h" | |
#include "extended/gff3_parser.h" | |
#include "extended/gff3_visitor.h" | |
#include "extended/node_visitor_rep.h" | |
+#include <stdbool.h> | |
struct GtGFF3Visitor { | |
const GtNodeVisitor parent_instance; | |
bool version_string_shown, | |
+ retain_ids, | |
fasta_directive_shown; | |
GtStringDistri *id_counter; | |
GtHashmap *gt_feature_node_to_id_array, | |
*gt_feature_node_to_unique_id_str; | |
unsigned long fasta_width; | |
GtGenFile *outfp; | |
+ GtCstrTable *gt_used_ids; | |
}; | |
typedef struct { | |
@@ -190,6 +195,30 @@ static GtStr* create_unique_id(GtGFF3Visitor *gff3_visitor, GtFeatureNode *gf) | |
return id; | |
} | |
+/* Append ".<suffix>" to <id> in place, e.g. "name" becomes "name.1" for | |
+   suffix 1. */ | |
+static void make_unique_id_string(GtStr *id, unsigned long suffix) | |
+{ | |
+  gt_str_append_char(id, '.');    /* separator between name and number */ | |
+  gt_str_append_ulong(id, suffix); | |
+} | |
+ | |
+/* Make <id> unique among all IDs recorded so far in the used-ID table of | |
+   <gff3_visitor>. If <id> is already in the table, numeric suffixes are | |
+   tried in ascending order ("name.1", "name.2", ...) until a candidate is | |
+   found which is not in the table. <id> is modified in place and the final | |
+   string is added to the table. */ | |
+static void make_id_unique(GtGFF3Visitor *gff3_visitor, GtStr *id) | |
+{ | |
+  GtCstrTable *used_ids; | |
+  GtStr *base_id; | |
+  unsigned long i = 0; | |
+ | |
+  gt_assert(gff3_visitor && id); | |
+  used_ids = gff3_visitor->gt_used_ids; | |
+ | |
+  if (gt_cstr_table_get(used_ids, gt_str_get(id))) { | |
+    /* TODO: add warning */ | |
+    /* a separate copy of the original ID is needed as the base, because | |
+       <id> itself is overwritten on every attempt */ | |
+    base_id = gt_str_new_cstr(gt_str_get(id)); | |
+    do { | |
+      /* rebuild the candidate from the base, so suffixes do not pile up | |
+         (name.1, name.2, ... instead of name.1.2) */ | |
+      gt_str_set(id, gt_str_get(base_id)); | |
+      make_unique_id_string(id, ++i); | |
+    } while (gt_cstr_table_get(used_ids, gt_str_get(id))); | |
+    gt_str_delete(base_id); | |
+  } | |
+ | |
+  /* update table with the new id */ | |
+  gt_cstr_table_add(used_ids, gt_str_get(id)); | |
+} | |
+ | |
static int store_ids(GtGenomeNode *gn, void *data, GtError *err) | |
{ | |
GtGFF3Visitor *gff3_visitor = (GtGFF3Visitor*) data; | |
@@ -197,32 +226,48 @@ static int store_ids(GtGenomeNode *gn, void *data, GtError *err) | |
AddIDInfo add_id_info; | |
int had_err = 0; | |
GtStr *id; | |
+ const char *id_string; | |
+ bool has_id = false; | |
gt_error_check(err); | |
gt_assert(gn && gf && gff3_visitor); | |
+ id_string = gt_feature_node_get_attribute(gf, "ID"); | |
- if (gt_genome_node_has_children(gn) || gt_feature_node_is_multi(gf)) { | |
- if (gt_feature_node_is_multi(gf)) { | |
- id = gt_hashmap_get(gff3_visitor->gt_feature_node_to_unique_id_str, | |
- gt_feature_node_get_multi_representative(gf)); | |
- if (!id) { /* the representative does not have its own id */ | |
- id = create_unique_id(gff3_visitor, | |
- gt_feature_node_get_multi_representative(gf)); | |
- } | |
+ if (id_string) { | |
+ id = gt_str_new_cstr(id_string); | |
+ has_id = true; | |
+ } | |
+ else { | |
+ /* no id, but it's a multi feature. can this even happen? */ | |
+ if (gt_feature_node_is_multi(gf) || gt_genome_node_has_children(gn) ) { | |
if (gt_feature_node_get_multi_representative(gf) != gf) { | |
- gt_hashmap_add(gff3_visitor->gt_feature_node_to_unique_id_str, gf, | |
- gt_str_ref(id)); | |
+ id = gt_hashmap_get(gff3_visitor->gt_feature_node_to_unique_id_str, | |
+ gt_feature_node_get_multi_representative(gf)); | |
+ has_id = true; | |
} | |
} | |
- else | |
- id = create_unique_id(gff3_visitor, gf); | |
+ /* it doesnt have an id and it is not a child-feature */ | |
+ else if ( ! gt_feature_node_get_attribute(gf, "Parent")) { | |
+ id = create_unique_id(gff3_visitor, gf); | |
+ has_id = true; | |
+ } | |
+ | |
+ } | |
+ if (has_id) { | |
+ if (gff3_visitor->retain_ids) { | |
+ make_id_unique(gff3_visitor, id); | |
+ } | |
+ gt_hashmap_add(gff3_visitor->gt_feature_node_to_unique_id_str, gf, | |
+ gt_str_ref(id)); | |
/* for each child -> store the parent feature in the hash map */ | |
add_id_info.gt_feature_node_to_id_array = | |
- gff3_visitor->gt_feature_node_to_id_array, | |
+ gff3_visitor->gt_feature_node_to_id_array, | |
add_id_info.id = gt_str_get(id); | |
had_err = gt_genome_node_traverse_direct_children(gn, &add_id_info, add_id, | |
err); | |
+ /* Q: needed? */ | |
+ gt_str_delete(id); | |
} | |
return had_err; | |
} | |
@@ -262,8 +307,9 @@ static int gff3_visitor_feature_node(GtNodeVisitor *gv, GtFeatureNode *fn, | |
/* show terminator, if the feature has children (otherwise it is clear that | |
the feature is complete, because no ID attribute has been shown) */ | |
- if (gt_genome_node_has_children((GtGenomeNode*) fn)) | |
- gt_genfile_xprintf(gff3_visitor->outfp, "%s\n", GFF_TERMINATOR); | |
+ /* if (gt_genome_node_has_children((GtGenomeNode*) fn)) */ | |
+ gt_genfile_xprintf(gff3_visitor->outfp, "%s\n", GFF_TERMINATOR); | |
+ /* printf("%s\n", gt_feature_node_get_attribute(fn, "ID")); */ | |
return had_err; | |
} | |
@@ -329,9 +375,18 @@ GtNodeVisitor* gt_gff3_visitor_new(GtGenFile *outfp) | |
HASH_DIRECT, NULL, (GtFree) gt_str_delete); | |
gff3_visitor->fasta_width = 0; | |
gff3_visitor->outfp = outfp; | |
+ /* gt_used_ids is only consulted when retain_ids is set to true */ | |
+ gff3_visitor->gt_used_ids = gt_cstr_table_new(); | |
+ gff3_visitor->retain_ids = false; | |
return gv; | |
} | |
+/* Set the retain_ids flag of the GFF3 visitor <gv>. The flag is read in | |
+   store_ids(), which passes an ID through make_id_unique() only when the | |
+   flag is set. */ | |
+void gt_gff3_visitor_retain_id_attributes(GtNodeVisitor *gv) | |
+{ | |
+  GtGFF3Visitor *visitor; | |
+ | |
+  visitor = gff3_visitor_cast(gv); | |
+  visitor->retain_ids = true; | |
+} | |
+ | |
void gt_gff3_visitor_set_fasta_width(GtNodeVisitor *gv, | |
unsigned long fasta_width) | |
{ | |
diff --git a/src/extended/gff3_visitor.h b/src/extended/gff3_visitor.h | |
index 1939565..530371c 100644 | |
--- a/src/extended/gff3_visitor.h | |
+++ b/src/extended/gff3_visitor.h | |
@@ -27,5 +27,6 @@ const GtNodeVisitorClass* gt_gff3_visitor_class(void); | |
GtNodeVisitor* gt_gff3_visitor_new(GtGenFile*); | |
void gt_gff3_visitor_set_fasta_width(GtNodeVisitor*, | |
unsigned long); | |
+void gt_gff3_visitor_retain_id_attributes(GtNodeVisitor *); | |
#endif | |
diff --git a/src/tools/gt_gff3.c b/src/tools/gt_gff3.c | |
index 74a1457..f498678 100644 | |
--- a/src/tools/gt_gff3.c | |
+++ b/src/tools/gt_gff3.c | |
@@ -35,6 +35,7 @@ | |
typedef struct { | |
bool sort, | |
checkids, | |
+ retainids, | |
mergefeat, | |
addintrons, | |
verbose, | |
@@ -91,6 +92,14 @@ static GtOptionParser* gt_gff3_option_parser_new(void *tool_arguments) | |
"parsing", &arguments->tidy, false); | |
gt_option_parser_add_option(op, option); | |
+  /* -retainids: registers the boolean option which sets arguments->retainids; | |
+     adjacent string literals are concatenated, so each line that continues a | |
+     sentence needs its own trailing space */ | |
+  option = gt_option_new_bool("retainids", | |
+                              "when available, use the original IDs " | |
+                              "provided in the source file\n" | |
+                              "(memory consumption is O(file_size))", | |
+                              &arguments->retainids, true); | |
+  gt_option_parser_add_option(op, option); | |
+ | |
/* -checkids */ | |
option = gt_option_new_bool("checkids", | |
"make sure the ID attributes are unique " | |
@@ -188,6 +197,7 @@ static int gt_gff3_runner(int argc, const char **argv, int parsed_args, | |
gt_gff3_in_stream_show_progress_bar((GtGFF3InStream*) gff3_in_stream); | |
if (arguments->checkids) | |
gt_gff3_in_stream_check_id_attributes((GtGFF3InStream*) gff3_in_stream); | |
+ | |
last_stream = gff3_in_stream; | |
/* set different type checker if necessary */ | |
@@ -244,6 +254,9 @@ static int gt_gff3_runner(int argc, const char **argv, int parsed_args, | |
gt_gff3_out_stream_set_fasta_width(gff3_out_stream, arguments->width); | |
} | |
+ if (!had_err && arguments->retainids) | |
+ gt_gff3_out_stream_retain_id_attributes(gff3_out_stream); | |
+ | |
/* pull the features through the stream and free them afterwards */ | |
if (!had_err) { | |
while (!(had_err = gt_node_stream_next(gff3_out_stream, &gn, err)) && |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment