AnnaTSW0609 · July 2, 2018 02:44
diff --git a/config.py b/config.py
 from config_hub import *


 # Source-side database settings (DATA_SRC_*).
 # NOTE(review): presumably the MongoDB instance/database holding the raw dumped
 # source data -- confirm against config_hub and the hub documentation.
 DATA_SRC_SERVER = "myhost" # I tried changing this to my own host name, but the run still failed
 DATA_SRC_PORT = 27017
 DATA_SRC_DATABASE = "tutorial_src" # unsure whether this has to be renamed for the Ensembl Plants database used this time
 DATA_SRC_SERVER_USERNAME = None
 DATA_SRC_SERVER_PASSWORD = None

 # Target-side database settings (DATA_TARGET_*); same host and port as the
 # source side, but a different database name.
 DATA_TARGET_SERVER = "myhost"
 DATA_TARGET_PORT = 27017
 DATA_TARGET_DATABASE = "tutorial"
 DATA_TARGET_SERVER_USERNAME = None
 DATA_TARGET_SERVER_PASSWORD = None

 # Hub database backend: a module path plus a MongoDB connection URI.
 # Left unchanged, since 27017 appears to be the default port according to the
 # MongoDB documentation.
 HUB_DB_BACKEND = {
 		"module" : "biothings.utils.mongo",
 		"uri" : "mongodb://myhost:27017",
 		}

 # Filesystem locations; LOG_FOLDER is a sub-folder of DATA_ARCHIVE_ROOT.
 # I changed these to the folders where I would like my data to be stored,
 # but the same error still appears.
 DATA_ARCHIVE_ROOT = "/tmp/tutorial" 
 LOG_FOLDER = "/tmp/tutorial/logs" 
diff --git a/Plant_Dumper_1st_draft.py b/Plant_Dumper_1st_draft.py
 import sys
 import os
 import time
 from ftplib import FTP
 import requests

 import biothings, config
 biothings.config_for_app(config)

 from biothings.utils.common import timesofar, safewfile, is_int
 from biothings.utils.hipchat import hipchat_msg
 from biothings.utils.hub_db import get_src_dump
 from biothings.utils.dataload import tab2list
 from config import DATA_ARCHIVE_ROOT, logger as logging
 from biothings.hub.dataload.dumper import HTTPDumper

 # Fully filled-in BioMart XML query, kept as a worked example: it targets the
 # "plants_mart" virtual schema and the "athaliana_eg_gene" dataset, filters on
 # biotype "antisense_RNA" and requests eight gene attributes as TSV.
 # It is not referenced by any other code visible in this file.
 XML_QUERY_TEMPLATE_EXAMPLE = '''<?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE Query>
 <Query  virtualSchemaName = "plants_mart" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
 			
 	<Dataset name = "athaliana_eg_gene" interface = "default" >
 		<Filter name = "biotype" value = "antisense_RNA"/>
 		<Attribute name = "ensembl_gene_id" />
 		<Attribute name = "start_position" />
 		<Attribute name = "end_position" />
 		<Attribute name = "external_gene_name" />
 		<Attribute name = "chromosome_name" />
 		<Attribute name = "strand" />
 		<Attribute name = "description" />
 		<Attribute name = "gene_biotype" />
 	</Dataset>
 </Query>
 '''

 # Parameterised form of the query above, using %-style named placeholders:
 # virtual_schema, dataset, filters and attributes. Unlike the example it sets
 # uniqueRows = "1". BioMart.TEMPLATE below is bound to this string.
 XML_QUERY_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE Query>
 <Query  virtualSchemaName = "%(virtual_schema)s" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
    <Dataset name = "%(dataset)s" interface = "default" >
        %(filters)s
        %(attributes)s
    </Dataset>
 </Query>
 '''

 class BioMart(HTTPDumper):
    """Dumper for gene annotations served by the Ensembl Plants BioMart.

    The class attributes name the source, derive its local data folder from
    DATA_ARCHIVE_ROOT, and point at the mart service URL and query template.

    NOTE(review): how the base class consumes DUMP_METHOD and SCHEDULE is not
    visible here -- presumably an output-file-name -> method-name mapping and
    a cron-style schedule string; confirm against HTTPDumper.
    """

    SRC_NAME = "ensembl"
    SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)

    ENSEMBL_FTP_HOST = "ftp.plant.ensembl.org"
    MART_URL = "https://plants.ensembl.org/biomart/martservice"
    # Alternative endpoint, currently disabled:
    #MART_URL = "http://uswest.ensembl.org/biomart/martservice"
    TEMPLATE = XML_QUERY_TEMPLATE
    species_li = []
    # The value must match the method name below exactly, including the
    # "emsembl" spelling.
    DUMP_METHOD = {"gene_emsembl_get_plant_gene.txt": "gene_emsembl_get_plant_gene"}

    SCHEDULE = "0 6 * * *"

    def gene_emsembl_get_plant_gene(self, outfile, debug=False):
        """Request the plant gene table and hand it to ``self._fetch_data``.

        Args:
            outfile: Destination passed straight through to ``_fetch_data``.
            debug: Forwarded unchanged to ``_fetch_data``.

        The header lists nine column names while only eight mart attributes
        are requested; 'taxonomy_id' has no matching attribute.
        NOTE(review): presumably ``_fetch_data`` (not defined in this excerpt)
        supplies that column itself -- confirm.
        """
        mart_attributes = [
            "ensembl_gene_id",
            "external_gene_name",   # symbols, called "external_gene_id" before release 76
            "start_position",
            "end_position",
            "chromosome_name",
            "strand",
            "description",
            "gene_biotype",
        ]
        column_names = [
            'taxonomy_id',
            'ensembl_gene_id',
            'symbol',
            'gene_chrom_start',
            'gene_chrom_end',
            'chr_name',
            'chrom_strand',
            'description',
            'type_of_gene',
        ]
        # NOTE(review): this filter list is built but never handed to
        # _fetch_data, so it has no effect on the request as written.
        biotype_filters = ["Gene type: antisense_RNA"]
        self._fetch_data(outfile, mart_attributes, header=column_names, debug=debug)
	from config_hub import *


	# Source-side database settings (DATA_SRC_*).
	# NOTE(review): presumably the MongoDB instance/database holding the raw dumped
	# source data -- confirm against config_hub and the hub documentation.
	DATA_SRC_SERVER = "myhost" # I tried changing this to my own host name, but the run still failed
	DATA_SRC_PORT = 27017
	DATA_SRC_DATABASE = "tutorial_src" # unsure whether this has to be renamed for the Ensembl Plants database used this time
	DATA_SRC_SERVER_USERNAME = None
	DATA_SRC_SERVER_PASSWORD = None

	# Target-side database settings (DATA_TARGET_*); same host and port as the
	# source side, but a different database name.
	DATA_TARGET_SERVER = "myhost"
	DATA_TARGET_PORT = 27017
	DATA_TARGET_DATABASE = "tutorial"
	DATA_TARGET_SERVER_USERNAME = None
	DATA_TARGET_SERVER_PASSWORD = None

	# Hub database backend: a module path plus a MongoDB connection URI.
	# Left unchanged, since 27017 appears to be the default port according to the
	# MongoDB documentation.
	HUB_DB_BACKEND = {
	"module" : "biothings.utils.mongo",
	"uri" : "mongodb://myhost:27017",
	}

	# Filesystem locations; LOG_FOLDER is a sub-folder of DATA_ARCHIVE_ROOT.
	# I changed these to the folders where I would like my data to be stored,
	# but the same error still appears.
	DATA_ARCHIVE_ROOT = "/tmp/tutorial"
	LOG_FOLDER = "/tmp/tutorial/logs"
	import sys
	import os
	import time
	from ftplib import FTP
	import requests

	import biothings, config
	biothings.config_for_app(config)

	from biothings.utils.common import timesofar, safewfile, is_int
	from biothings.utils.hipchat import hipchat_msg
	from biothings.utils.hub_db import get_src_dump
	from biothings.utils.dataload import tab2list
	from config import DATA_ARCHIVE_ROOT, logger as logging
	from biothings.hub.dataload.dumper import HTTPDumper

	# Fully filled-in BioMart XML query, kept as a worked example: it targets the
	# "plants_mart" virtual schema and the "athaliana_eg_gene" dataset, filters on
	# biotype "antisense_RNA" and requests eight gene attributes as TSV.
	# It is not referenced by any other code visible in this file.
	XML_QUERY_TEMPLATE_EXAMPLE = '''<?xml version="1.0" encoding="UTF-8"?>
	<!DOCTYPE Query>
	<Query virtualSchemaName = "plants_mart" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >

	<Dataset name = "athaliana_eg_gene" interface = "default" >
	<Filter name = "biotype" value = "antisense_RNA"/>
	<Attribute name = "ensembl_gene_id" />
	<Attribute name = "start_position" />
	<Attribute name = "end_position" />
	<Attribute name = "external_gene_name" />
	<Attribute name = "chromosome_name" />
	<Attribute name = "strand" />
	<Attribute name = "description" />
	<Attribute name = "gene_biotype" />
	</Dataset>
	</Query>
	'''

	# Parameterised form of the query above, using %-style named placeholders:
	# virtual_schema, dataset, filters and attributes. Unlike the example it sets
	# uniqueRows = "1". BioMart.TEMPLATE below is bound to this string.
	XML_QUERY_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
	<!DOCTYPE Query>
	<Query virtualSchemaName = "%(virtual_schema)s" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
	<Dataset name = "%(dataset)s" interface = "default" >
	%(filters)s
	%(attributes)s
	</Dataset>
	</Query>
	'''

	class BioMart(HTTPDumper):
	    """Dumper for gene annotations served by the Ensembl Plants BioMart.

	    The class attributes name the source, derive its local data folder from
	    DATA_ARCHIVE_ROOT, and point at the mart service URL and query template.

	    NOTE(review): how the base class consumes DUMP_METHOD and SCHEDULE is
	    not visible here -- presumably an output-file-name -> method-name
	    mapping and a cron-style schedule string; confirm against HTTPDumper.
	    """

	    SRC_NAME = "ensembl"
	    SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)

	    ENSEMBL_FTP_HOST = "ftp.plant.ensembl.org"
	    MART_URL = "https://plants.ensembl.org/biomart/martservice"
	    #MART_URL = "http://uswest.ensembl.org/biomart/martservice"
	    TEMPLATE = XML_QUERY_TEMPLATE
	    species_li = []
	    # The value must match the method name below exactly, including the
	    # "emsembl" spelling.
	    DUMP_METHOD = {"gene_emsembl_get_plant_gene.txt": "gene_emsembl_get_plant_gene"}

	    SCHEDULE = "0 6 * * *"

	    def gene_emsembl_get_plant_gene(self, outfile, debug=False):
	        """Request the plant gene table and hand it to ``self._fetch_data``.

	        Args:
	            outfile: Destination passed straight through to ``_fetch_data``.
	            debug: Forwarded unchanged to ``_fetch_data``.

	        The header lists nine column names while only eight mart attributes
	        are requested; 'taxonomy_id' has no matching attribute.
	        NOTE(review): presumably ``_fetch_data`` (not defined in this
	        excerpt) supplies that column itself -- confirm.
	        """
	        header = ['taxonomy_id',
	                  'ensembl_gene_id',
	                  'symbol',
	                  'gene_chrom_start', 'gene_chrom_end', 'chr_name', 'chrom_strand',
	                  'description', 'type_of_gene']
	        attributes = ["ensembl_gene_id",
	                      "external_gene_name",  # symbols, called "external_gene_id" before release 76
	                      "start_position", "end_position", "chromosome_name", "strand",
	                      "description", "gene_biotype"]
	        # NOTE(review): this filter list is built but never handed to
	        # _fetch_data, so it has no effect on the request as written.
	        filters = ["Gene type: antisense_RNA"]
	        self._fetch_data(outfile, attributes, header=header, debug=debug)