Last active
July 2, 2018 02:44
-
-
Save AnnaTSW0609/239d19da1f262b3f9adc63d947141491 to your computer and use it in GitHub Desktop.
Ensembl_Plant
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from config_hub import * | |
# ---------------------------------------------------------------------------
# Source database: where raw, per-datasource documents are stored.
# ---------------------------------------------------------------------------
# NOTE(author): replacing "myhost" with my real host name did not make the
# hub run successfully.
DATA_SRC_SERVER = "myhost"
DATA_SRC_PORT = 27017
# NOTE(author): unsure whether this name has to change to match the Ensembl
# Plants database used in this project.
DATA_SRC_DATABASE = "tutorial_src"
# No credentials are configured for the source server.
DATA_SRC_SERVER_USERNAME = DATA_SRC_SERVER_PASSWORD = None

# ---------------------------------------------------------------------------
# Target database: same host/port as the source, separate database name.
# ---------------------------------------------------------------------------
DATA_TARGET_SERVER = "myhost"
DATA_TARGET_PORT = 27017
DATA_TARGET_DATABASE = "tutorial"
# No credentials are configured for the target server either.
DATA_TARGET_SERVER_USERNAME = DATA_TARGET_SERVER_PASSWORD = None

# ---------------------------------------------------------------------------
# Hub database backend.
# ---------------------------------------------------------------------------
# NOTE(author): port 27017 is left unchanged because the MongoDB
# documentation lists it as the default port.
# NOTE(review): the host and port are repeated inside this URI, so they must
# be edited here as well whenever the server settings above change.
HUB_DB_BACKEND = dict(
    module="biothings.utils.mongo",
    uri="mongodb://myhost:27017",
)

# ---------------------------------------------------------------------------
# Filesystem locations.
# ---------------------------------------------------------------------------
# NOTE(author): these were pointed at the folders where the data should be
# stored, but the same error still appeared.
DATA_ARCHIVE_ROOT = "/tmp/tutorial"
LOG_FOLDER = "/tmp/tutorial/logs"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| import os | |
| import time | |
| from ftplib import FTP | |
| import requests | |
| import biothings, config | |
| biothings.config_for_app(config) | |
| from biothings.utils.common import timesofar, safewfile, is_int | |
| from biothings.utils.hipchat import hipchat_msg | |
| from biothings.utils.hub_db import get_src_dump | |
| from biothings.utils.dataload import tab2list | |
| from config import DATA_ARCHIVE_ROOT, logger as logging | |
| from biothings.hub.dataload.dumper import HTTPDumper | |
# A fully filled-in BioMart XML query, kept as a reference for what a
# rendered query looks like: one dataset, one <Filter> element and eight
# <Attribute> elements. It is not referenced by the code visible in this file.
XML_QUERY_TEMPLATE_EXAMPLE = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<!DOCTYPE Query>\n'
    '<Query virtualSchemaName = "plants_mart" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >\n'
    '<Dataset name = "athaliana_eg_gene" interface = "default" >\n'
    '<Filter name = "biotype" value = "antisense_RNA"/>\n'
    '<Attribute name = "ensembl_gene_id" />\n'
    '<Attribute name = "start_position" />\n'
    '<Attribute name = "end_position" />\n'
    '<Attribute name = "external_gene_name" />\n'
    '<Attribute name = "chromosome_name" />\n'
    '<Attribute name = "strand" />\n'
    '<Attribute name = "description" />\n'
    '<Attribute name = "gene_biotype" />\n'
    '</Dataset>\n'
    '</Query>\n'
)
# %-style template for a BioMart XML query. It must be rendered with a
# mapping that supplies four keys:
#   virtual_schema - value of the Query's virtualSchemaName attribute
#   dataset        - value of the Dataset's name attribute
#   filters        - text placed inside <Dataset> before the attributes
#   attributes     - text placed inside <Dataset> after the filters
# The query asks for TSV output without a header row and with unique rows.
XML_QUERY_TEMPLATE = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<!DOCTYPE Query>\n'
    '<Query virtualSchemaName = "%(virtual_schema)s" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >\n'
    '<Dataset name = "%(dataset)s" interface = "default" >\n'
    '%(filters)s\n'
    '%(attributes)s\n'
    '</Dataset>\n'
    '</Query>\n'
)
class BioMart(HTTPDumper):
    """Dumper configured for the Ensembl Plants BioMart web service.

    The class only declares configuration constants and one dump method;
    the actual download is delegated to ``self._fetch_data``.

    NOTE(review): ``_fetch_data`` is not defined in this class as shown.
    Confirm that ``HTTPDumper`` (or a mixin) provides it, otherwise
    ``gene_emsembl_get_plant_gene`` will raise ``AttributeError``.
    """

    # Datasource name and the folder, under the archive root, for its files.
    SRC_NAME = "ensembl"
    SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)

    # Remote endpoints. ENSEMBL_FTP_HOST is not referenced by the code
    # visible here.
    ENSEMBL_FTP_HOST = "ftp.plant.ensembl.org"
    MART_URL = "https://plants.ensembl.org/biomart/martservice"
    # Alternative endpoint kept for reference:
    #   http://uswest.ensembl.org/biomart/martservice

    # XML skeleton used to build BioMart queries.
    TEMPLATE = XML_QUERY_TEMPLATE

    # Species to dump; empty in this file.
    # NOTE(review): presumably has to be filled before a dump yields any
    # rows - confirm against _fetch_data.
    species_li = []

    # Output file name -> name of the method that produces it.
    # The "emsembl" spelling is part of both runtime identifiers and is
    # kept as is.
    DUMP_METHOD = {"gene_emsembl_get_plant_gene.txt": "gene_emsembl_get_plant_gene"}

    # Cron-style schedule expression.
    # NOTE(review): presumably "every day at 06:00" - confirm how the hub
    # interprets this value.
    SCHEDULE = "0 6 * * *"

    def gene_emsembl_get_plant_gene(self, outfile, debug=False):
        """Request the plant gene table and write it to ``outfile``.

        Args:
            outfile: destination handed through to ``self._fetch_data``.
            debug: flag forwarded unchanged to ``self._fetch_data``.

        Returns:
            None. The result of ``self._fetch_data`` is not returned.
        """
        # Column titles for the output file. There is one more title than
        # there are BioMart attributes below: 'taxonomy_id' has no matching
        # attribute.
        # NOTE(review): presumably _fetch_data adds the taxonomy id itself
        # - confirm.
        columns = [
            'taxonomy_id',
            'ensembl_gene_id',
            'symbol',
            'gene_chrom_start',
            'gene_chrom_end',
            'chr_name',
            'chrom_strand',
            'description',
            'type_of_gene',
        ]

        # BioMart attribute names to request, in output-column order.
        mart_attributes = [
            "ensembl_gene_id",
            # symbols, called "external_gene_id" before release 76
            "external_gene_name",
            "start_position",
            "end_position",
            "chromosome_name",
            "strand",
            "description",
            "gene_biotype",
        ]

        # NOTE(review): this filter is never passed to _fetch_data, so the
        # request below is sent without it. The value is also a display
        # label, not the name/value pair used in XML_QUERY_TEMPLATE_EXAMPLE
        # (name="biotype", value="antisense_RNA"). Wire it in once the
        # _fetch_data signature is confirmed.
        filters = ["Gene type: antisense_RNA"]

        self._fetch_data(outfile, mart_attributes, header=columns, debug=debug)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment