Created
April 4, 2024 10:59
-
-
Save lindenb/a20692a5f34d9da187cc834ad08c1896 to your computer and use it in GitHub Desktop.
Tool to Identify Gene, Regulatory Role, and Function at Integration Sites https://www.biostars.org/p/9591769/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# gtf2rdf.awk -- turn the "gene" records of a tab-separated GTF stream into
# RDF/XML <bio:Gene> fragments (no enclosing <rdf:RDF> element is written).
#
# Usage:  awk -v BUILD=<genome build label> -f gtf2rdf.awk < annotation.gtf
#
# Input columns read: $1 (chromosome), $3 (feature type), $4 (start),
# $5 (end) and $9 (the `key "value"; key "value"; ...` attribute list).
# Records whose attribute list carries no gene_id are skipped.
#
# Limitation: attributes are split on ';', so a ';' inside a quoted value
# would still cut that value short.

# Return s with the characters that are markup-significant in XML replaced by
# entities, so arbitrary annotation text cannot produce malformed output.
# ("\\&" is a literal ampersand in a gsub() replacement string.)
function xml_escape(s) {
	gsub(/&/, "\\&amp;", s);
	gsub(/</, "\\&lt;", s);
	gsub(/>/, "\\&gt;", s);
	return s;
}

BEGIN {
	FS = "\t";
}

($3 == "gene") {
	gene_id = "";
	gene_name = "";
	gene_biotype = "";

	n_attrs = split($9, attrs, /[ ]*[;][ ]*/);
	for (i = 1; i <= n_attrs; ++i) {
		# The key ends at the first space; everything after it is the value,
		# so values that themselves contain spaces are kept whole.
		sep = index(attrs[i], " ");
		if (sep == 0) continue;	# empty trailing piece after the last ';'
		key = substr(attrs[i], 1, sep - 1);
		value = substr(attrs[i], sep + 1);
		sub(/^[ ]+/, "", value);
		gsub(/"/, "", value);	# drop the quotes surrounding the value

		if (key == "gene_id") gene_id = value;
		else if (key == "gene_name") gene_name = value;
		else if (key == "gene_biotype") gene_biotype = value;
	}

	if (gene_id == "") next;

	printf("<bio:Gene rdf:about=\"%s\">\n", xml_escape(gene_id));
	printf("\t<bio:gene_id>%s</bio:gene_id>\n", xml_escape(gene_id));
	# gene_name and gene_biotype are optional: emitted only when present.
	if (gene_name != "") printf("\t<bio:gene_name>%s</bio:gene_name>\n", xml_escape(gene_name));
	if (gene_biotype != "") printf("\t<bio:gene_biotype>%s</bio:gene_biotype>\n", xml_escape(gene_biotype));
	printf("\t<bio:location>\n");
	printf("\t\t<bio:Location>\n");
	printf("\t\t\t<bio:build>%s</bio:build>\n", xml_escape(BUILD));
	printf("\t\t\t<bio:chrom>%s</bio:chrom>\n", xml_escape($1));
	printf("\t\t\t<bio:start rdf:datatype=\"http://www.w3.org/2001/XMLSchema#int\">%s</bio:start>\n", $4);
	printf("\t\t\t<bio:end rdf:datatype=\"http://www.w3.org/2001/XMLSchema#int\">%s</bio:end>\n", $5);
	printf("\t\t</bio:Location>\n");
	printf("\t</bio:location>\n");
	printf("</bio:Gene>\n");
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Builds an RDF/XML database from NCBI gene-to-GO tables and an Ensembl GTF,
# then runs query.01.sparql against it with Apache Jena's `arq`.
#
# Tunable from the command line, e.g.  make ARQ=/opt/jena/bin/arq BUILD=GRCh38

# Recipes rely on bash-only syntax ($$'\t').
SHELL := /bin/bash
# Abort a recipe line as soon as any stage of a pipeline fails, so a failed
# download cannot silently yield an empty output (honoured by GNU Make >= 3.82).
.SHELLFLAGS := -e -o pipefail -c
# Remove a half-written target when its recipe fails, so it is rebuilt next time.
.DELETE_ON_ERROR:

OUTDIR := TMP
BUILD := GRCh38
# Path to Jena's `arq` executable; the default is a placeholder to override.
ARQ := /path/to/apache-jena-4.8.0/bin/arq

.PHONY: all

# Default goal: run the SPARQL query (2nd prerequisite) on the database (1st).
all: $(OUTDIR)/database.rdf query.01.sparql
	$(ARQ) --data=$< --query=$(word 2,$^)

# Wrap the RDF fragments of all prerequisites in a single <rdf:RDF> document.
$(OUTDIR)/database.rdf: $(OUTDIR)/go.rdf $(OUTDIR)/gtf.rdf
	mkdir -p $(dir $@)
	echo '<?xml version="1.0" encoding="UTF-8"?><rdf:RDF xmlns:bio="https://www.biostars.org/#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xml:base="https://www.biostars.org/">' > $@
	cat $^ >> $@
	echo "</rdf:RDF>" >> $@

# GO fragments: one <bio:Term> per distinct (GO id, label) pair and one
# <rdf:Description> per distinct (gene, GO id) link.
# Both downloads keep rows whose first column is 9606 (presumably the NCBI
# taxonomy id for human -- confirm) and are joined on their first kept column.
# After the join, columns 2 and 3 are used as GO id and label and column 4 as
# the gene resource name.
$(OUTDIR)/go.rdf:
	mkdir -p $(dir $@)
	wget -O - "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz" | gunzip -c |\
		awk -F '\t' '$$1==9606' | cut -f 2,3,6 | sort -T $(dir $@) -t $$'\t' -k1,1 > $(addsuffix .tmp1,$@)
	wget -O - "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2ensembl.gz" | gunzip -c |\
		awk -F '\t' '$$1==9606' | cut -f 2,3 | sort -T $(dir $@) -t $$'\t' -k1,1 > $(addsuffix .tmp2,$@)
	join -t $$'\t' -1 1 -2 1 $(addsuffix .tmp1,$@) $(addsuffix .tmp2,$@) > $(addsuffix .tmp3,$@)
	# First write truncates the target so stale content is never appended to.
	# Labels are XML-escaped; "\\&" is a literal ampersand in awk's gsub().
	cut -f 2,3 $(addsuffix .tmp3,$@) | sort -T $(dir $@) | uniq |\
		awk -F '\t' 'function esc(s){gsub(/&/,"\\&amp;",s);gsub(/</,"\\&lt;",s);gsub(/>/,"\\&gt;",s);return s} {GO=$$1;gsub(/:/,"_",GO); printf("<bio:Term rdf:about=\"%s\"><bio:go_id>%s</bio:go_id><rdfs:label>%s</rdfs:label></bio:Term>\n",GO,$$1,esc($$2));}' > $@
	# De-duplicated like the terms above: the join can repeat a (gene, GO) pair.
	cut -f 2,4 $(addsuffix .tmp3,$@) | sort -T $(dir $@) | uniq |\
		awk -F '\t' '{GO=$$1;gsub(/:/,"_",GO); printf("<rdf:Description rdf:about=\"%s\"><bio:has_go_term rdf:resource=\"%s\"/></rdf:Description>\n",$$2,GO);}' >> $@
	rm -f $(addsuffix .tmp1,$@) $(addsuffix .tmp2,$@) $(addsuffix .tmp3,$@)

# Gene fragments: download the Ensembl release-111 GTF for $(BUILD), keep the
# records whose first column is "1", and convert them with gtf2rdf.awk ($<).
$(OUTDIR)/gtf.rdf : gtf2rdf.awk
	mkdir -p $(dir $@)
	wget -O - "https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.$(BUILD).111.chr.gtf.gz" | gunzip -c |\
		awk '($$1=="1")' |\
		awk -v BUILD="$(BUILD)" -f $< > $@
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# List every gene whose location overlaps one genomic position, together with
# its GO terms when it has any.
# Edit the single VALUES row below to query another chromosome / position.
PREFIX bio: <https://www.biostars.org/#>
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>

SELECT
	?build
	?chrom
	?start
	?end
	?gene_id
	?gene_name
	?gene_biotype
	?go_id
	?go_label
WHERE {
	# The position of interest, defined once instead of in each FILTER.
	VALUES (?query_chrom ?query_pos) { ("1" 20746689) }

	?gene bio:gene_id ?gene_id .
	?gene bio:location ?loc .
	?loc a bio:Location .
	?loc bio:build ?build .
	?loc bio:chrom ?chrom .
	?loc bio:start ?start .
	?loc bio:end ?end .

	# Keep genes whose [start, end] interval contains the position.
	FILTER( ?chrom = ?query_chrom )
	FILTER( ?start <= ?query_pos )
	FILTER( ?end >= ?query_pos )

	# Name and biotype are optional so that a gene lacking either property
	# is still reported (with the column left unbound) instead of dropped.
	OPTIONAL { ?gene bio:gene_name ?gene_name . }
	OPTIONAL { ?gene bio:gene_biotype ?gene_biotype . }

	# GO annotation is optional: genes without GO terms yield a single row.
	OPTIONAL {
		?gene bio:has_go_term ?go .
		?go bio:go_id ?go_id .
		?go rdfs:label ?go_label .
	}
}
We can make this file beautiful and searchable if this error is corrected: No tabs found in this TSV file in line 0.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | |
| build | chrom | start | end | gene_id | gene_name | gene_biotype | go_id | go_label | | |
================================================================================================================================================================================================================================================ | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0006334" | "nucleosome assembly" | | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0000786" | "nucleosome" | | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0005634" | "nucleus" | | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0070828" | "heterochromatin organization" | | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0031491" | "nucleosome binding" | | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0006355" | "regulation of DNA-templated transcription" | | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0003677" | "DNA binding" | | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0005694" | "chromosome" | | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0016607" | "nuclear speck" | | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0042127" | "regulation of cell population proliferation" | | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0005515" | "protein binding" | | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0097298" | "regulation of nucleus size" | | |
| "GRCh38" | "1" | "20740266"^^<http://www.w3.org/2001/XMLSchema#int> | "20787323"^^<http://www.w3.org/2001/XMLSchema#int> | "ENSG00000127483" | "HP1BP3" | "protein_coding" | "GO:0071456" | "cellular response to hypoxia" | | |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment