Skip to content

Instantly share code, notes, and snippets.

@AnnaTSW0609
Last active July 2, 2018 02:44
Show Gist options
  • Save AnnaTSW0609/239d19da1f262b3f9adc63d947141491 to your computer and use it in GitHub Desktop.
Save AnnaTSW0609/239d19da1f262b3f9adc63d947141491 to your computer and use it in GitHub Desktop.
Ensembl_Plant
from config_hub import *
# ---------------------------------------------------------------------------
# Source database (MongoDB connection settings)
# ---------------------------------------------------------------------------
# NOTE: "myhost" is a placeholder. Replacing it with the real host name was
# tried, but the hub still failed to run.
DATA_SRC_SERVER = "myhost"
DATA_SRC_PORT = 27017
# TODO: confirm whether this should name the Ensembl Plants database used for
# this project rather than the tutorial default.
DATA_SRC_DATABASE = "tutorial_src"
DATA_SRC_SERVER_USERNAME = None
DATA_SRC_SERVER_PASSWORD = None

# ---------------------------------------------------------------------------
# Target database (MongoDB connection settings)
# ---------------------------------------------------------------------------
DATA_TARGET_SERVER = "myhost"
DATA_TARGET_PORT = 27017
DATA_TARGET_DATABASE = "tutorial"
DATA_TARGET_SERVER_USERNAME = None
DATA_TARGET_SERVER_PASSWORD = None

# ---------------------------------------------------------------------------
# Hub database backend
# ---------------------------------------------------------------------------
# The port is left at 27017, which the MongoDB documentation gives as the
# default.
HUB_DB_BACKEND = dict(
    module="biothings.utils.mongo",
    uri="mongodb://myhost:27017",
)

# ---------------------------------------------------------------------------
# Filesystem locations
# ---------------------------------------------------------------------------
# NOTE: these were pointed at the desired data directories during debugging,
# but the same error still appeared.
DATA_ARCHIVE_ROOT = "/tmp/tutorial"
LOG_FOLDER = "/tmp/tutorial/logs"
import sys
import os
import time
from ftplib import FTP
import requests
import biothings, config
biothings.config_for_app(config)
from biothings.utils.common import timesofar, safewfile, is_int
from biothings.utils.hipchat import hipchat_msg
from biothings.utils.hub_db import get_src_dump
from biothings.utils.dataload import tab2list
from config import DATA_ARCHIVE_ROOT, logger as logging
from biothings.hub.dataload.dumper import HTTPDumper
# Fully filled-in sample query, kept for reference only (nothing in this file
# reads it). It targets the "plants_mart" virtual schema and the
# "athaliana_eg_gene" dataset, filters on biotype "antisense_RNA", and asks
# for TSV output without a header row.
XML_QUERY_TEMPLATE_EXAMPLE = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "plants_mart" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "athaliana_eg_gene" interface = "default" >
<Filter name = "biotype" value = "antisense_RNA"/>
<Attribute name = "ensembl_gene_id" />
<Attribute name = "start_position" />
<Attribute name = "end_position" />
<Attribute name = "external_gene_name" />
<Attribute name = "chromosome_name" />
<Attribute name = "strand" />
<Attribute name = "description" />
<Attribute name = "gene_biotype" />
</Dataset>
</Query>
'''
# Parameterised query skeleton, assigned to BioMart.TEMPLATE below.
# It is meant to be filled with %-style dict formatting and expects the keys
# "virtual_schema", "dataset", "filters" and "attributes"; the last two are
# inserted verbatim, so they must already be XML fragments.
# Unlike the example above, this template sets uniqueRows = "1".
XML_QUERY_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "%(virtual_schema)s" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "%(dataset)s" interface = "default" >
%(filters)s
%(attributes)s
</Dataset>
</Query>
'''
class BioMart(HTTPDumper):
    """Dumper for gene annotations served by the Ensembl Plants BioMart.

    The class only declares where to fetch from and which columns to request.
    The actual download is delegated to ``self._fetch_data``, which is not
    defined in this class body and is presumably supplied elsewhere
    (TODO confirm).
    """

    SRC_NAME = "ensembl"
    SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)
    # NOTE(review): verify that this FTP host name actually resolves.
    ENSEMBL_FTP_HOST = "ftp.plant.ensembl.org"
    MART_URL = "https://plants.ensembl.org/biomart/martservice"
    TEMPLATE = XML_QUERY_TEMPLATE
    # NOTE(review): class-level mutable list, shared by every instance; it is
    # empty here and nothing in this class body populates it.
    species_li = []
    # Presumably maps an output file name to the name of the method that
    # produces it (the value matches the method defined below).
    DUMP_METHOD = {"gene_emsembl_get_plant_gene.txt": "gene_emsembl_get_plant_gene"}
    # Cron-style expression (minute 0, hour 6, every day).
    SCHEDULE = "0 6 * * *"

    def gene_emsembl_get_plant_gene(self, outfile, debug=False):
        """Request the main gene table and have it written to *outfile*.

        Args:
            outfile: destination passed straight through to ``_fetch_data``.
            debug: forwarded unchanged to ``_fetch_data``.

        Note:
            No BioMart filter is applied. An earlier revision built a local
            ``filters = ["Gene type: antisense_RNA"]`` list but never handed
            it to ``_fetch_data``, so the request has always been unfiltered;
            the dead variable was removed to stop implying otherwise. A real
            BioMart filter would look like the ``<Filter name = "biotype"
            value = "antisense_RNA"/>`` element in XML_QUERY_TEMPLATE_EXAMPLE.
        """
        # Output column names. The header has one more entry than
        # ``attributes`` ('taxonomy_id'); presumably ``_fetch_data`` adds that
        # column itself -- TODO confirm.
        header = [
            'taxonomy_id',
            'ensembl_gene_id',
            'symbol',
            'gene_chrom_start', 'gene_chrom_end', 'chr_name', 'chrom_strand',
            'description', 'type_of_gene',
        ]
        # BioMart attribute names, in the same order as the header columns
        # that follow 'taxonomy_id'.
        attributes = [
            "ensembl_gene_id",
            "external_gene_name",  # symbols, called "external_gene_id" before release 76
            "start_position", "end_position", "chromosome_name", "strand",
            "description", "gene_biotype",
        ]
        self._fetch_data(outfile, attributes, header=header, debug=debug)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment