Created
March 7, 2019 18:04
-
-
Save bewt85/a749a30c7d46ed7c2995eafc094bc10f to your computer and use it in GitHub Desktop.
Some code from a project downloading data from ENA
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib | |
import json | |
import logging | |
import os | |
import re | |
import requests | |
import subprocess | |
import sys | |
import time | |
import urllib | |
import pandas as pd | |
from collections import OrderedDict | |
logger = logging.getLogger(__name__) | |
def get_batches(df, count):
    """Yield consecutive positional row slices of *df*, *count* rows at a time.

    The slices are taken with ``df.iloc`` in order from the first row; the
    last slice is shorter when the row count is not a multiple of *count*.
    A frame with no rows yields nothing.
    """
    n_rows = df.shape[0]
    yield from (
        df.iloc[start:start + count]
        for start in range(0, n_rows, count)
    )
def get_chunks(g, size):
    """Yield successive lists of at most *size* items drawn from *g*.

    *g* may be any iterable. It is consumed lazily, one chunk at a time, so
    it can be an unbounded or expensive generator. The final chunk may be
    shorter than *size*; an empty iterable yields nothing.
    """
    # Imported here because the module's import block does not provide islice.
    from itertools import islice

    # Take a single iterator up front: calling islice() directly on a
    # re-iterable (e.g. a list) would restart from the beginning on every
    # call and never terminate.
    iterator = iter(g)
    chunk = list(islice(iterator, size))
    while chunk:
        yield chunk
        chunk = list(islice(iterator, size))
# ENA `read_run` field name -> human-readable label.
#
# Insertion order is significant: the keys are joined, in this order, into the
# `fields` request parameter and are also used as the DataFrame column order.
# Lines that are commented out are further field names that are currently
# left disabled.
fields = OrderedDict((
    # --- accessions ---
    ('study_accession', 'Study ID'),            # study accession number
    # ('secondary_study_accession', 'secondary_study_accession'),    # secondary study accession number
    ('sample_accession', 'Sample ID'),          # sample accession number
    # ('secondary_sample_accession', 'secondary_sample_accession'),  # secondary sample accession number
    ('experiment_accession', 'Experiment ID'),  # experiment accession number
    ('run_accession', 'Run ID'),                # run accession number
    # ('submission_accession', 'submission_accession'),              # submission accession number

    # --- organism, sequencing and submission ---
    ('tax_id', 'Submitted taxid'),                   # taxonomic ID
    ('scientific_name', 'Submitted scientific name'),  # scientific name
    # ('instrument_platform', 'instrument_platform'),  # instrument platform used in sequencing experiment
    # ('instrument_model', 'instrument_model'),        # instrument model used in sequencing experiment
    # ('library_name', 'library_name'),                # sequencing library name
    ('center_name', 'Submitting center'),            # submitting center
    ('first_public', 'Reads made public'),           # date when made public
    ('last_updated', 'Reads updated'),               # date when last updated
    ('experiment_title', 'Experiment'),              # brief experiment title
    ('study_title', 'Study'),                        # brief sequencing study description
    # ('study_alias', 'study_alias'),            # submitter's name for the study
    # ('experiment_alias', 'experiment_alias'),  # submitter's name for the experiment
    # ('run_alias', 'run_alias'),                # submitter's name for the run
    # ('sample_alias', 'sample_alias'),          # submitter's name for the sample

    # --- sample ---
    ('sample_title', 'Sample'),                     # brief sample title
    ('first_created', 'Reads uploaded'),            # date when first created
    ('sample_description', 'Sample description'),   # detailed sample description
    ('strain', 'Submitted strain'),                 # strain from which sample was obtained
    # serological variety of a species (usually a prokaryote) characterized
    # by its antigenic properties
    ('serovar', 'Submitted serovar'),
    # ('sex', 'sex'),                      # sex of the organism from which the sample was obtained
    # ('submitted_sex', 'submitted_sex'),  # sex of the organism from which the sample was obtained
    # ('dev_stage', 'dev_stage'),          # sample obtained from an organism in a specific developmental stage
    # ('tissue_type', 'tissue_type'),      # tissue type from which the sample was obtained
    # describes the physical, environmental and/or local geographical source
    # of the sample
    ('isolation_source', 'Isolation source'),
    ('isolate', 'Isolate'),                         # individual isolate from which sample was obtained

    # --- host ---
    # ('host_tax_id', 'Host taxid'),  # NCBI taxon id of the host
    # scientific name of the natural (as opposed to laboratory) host to the
    # organism from which sample was obtained
    ('host_scientific_name', 'Host'),
    # ('host_common_name', 'host_common_name'),  # common name of the host
    ('host_status', 'Host status'),                 # condition of host (eg. diseased or healthy)
    # ('host_sex', 'host_sex'),                      # physical sex of the host
    # ('submitted_host_sex', 'submitted_host_sex'),  # physical sex of the host
    # a unique identifier by which each subject can be referred to,
    # de-identified
    ('host_subject_id', 'Host ID'),

    # --- collection ---
    ('collection_date', 'Collection date'),         # date that the specimen was collected
    ('collected_by', 'Collected By'),               # name of the person who collected the specimen
    # name of the institution to which the person collecting the specimen
    # belongs. Format: Institute Name, Institute Address
    ('collecting_institute', 'Collecting institute'),
    # locality of sample isolation: country names, oceans or seas, followed
    # by regions and localities
    ('country', 'Country'),
    # geographical origin of the sample as defined by the specific region
    # name followed by the locality name
    ('region', 'Region'),
    ('location', 'Location'),                       # geographic location of isolation of the sample
    # ('environmental_sample', 'environmental_sample'),  # identifies sequences derived by direct molecular isolation from an environmental DNA sample
    # ('investigation_type', 'investigation_type'),      # the study type targeted by the sequencing
    # NOTE(review): the label below reads "data"; possibly a typo for "date".
    # Left unchanged because it is a runtime string - confirm before editing.
    ('receipt_date', 'Sample receipt data'),        # date on which the sample was received
    # ('sampling_site', 'sampling_site'),  # the site/station where this sample was collection
))
def get_raw_metdata():
    """Yield one metadata row per ENA run for the hard-coded project list.

    Pages through the ENA portal API ``search`` endpoint ``LIMIT`` records at
    a time, sorted by ``run_accession``. Each yielded row is a list holding
    one value per key of the module-level ``fields`` mapping, in that order;
    a field missing from a record is yielded as ``''``.

    Paging for a project stops when the API answers 204 or returns an empty
    page.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    # Imported explicitly: a bare ``import urllib`` does not by itself
    # guarantee that the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import quote_plus

    LIMIT = 1000
    projects = [
        'study_accession="PRJNA248792"'
    ]
    field_names = list(fields)
    # NOTE(review): 'fields' and 'query' are percent-encoded by hand here and
    # are then sent as form data, which requests encodes again. Kept as-is to
    # preserve the existing request; confirm this is what the API expects.
    payload = {
        'result': 'read_run',
        'fields': '%2C'.join(field_names),
        'limit': LIMIT,
        'dataPortal': 'pathogen',
        'format': 'json',
        'sortFields': 'run_accession'
    }
    # The context manager closes the session once the generator is exhausted
    # or closed by its consumer.
    with requests.Session() as s:
        for project in projects:
            offset = 0
            payload['query'] = quote_plus(project)
            while True:
                payload['offset'] = str(offset)
                r = s.post(
                    'https://www.ebi.ac.uk/ena/portal/api/search',
                    data=payload
                )
                if r.status_code == 204:
                    break
                r.raise_for_status()
                results = r.json()  # parse the body once per page
                logger.debug(
                    "%s new results found at offset %s", len(results), offset
                )
                if not results:
                    # A 200 response with no records would otherwise keep
                    # this loop requesting pages forever.
                    break
                for el in results:
                    yield [el.get(f, '') for f in field_names]
                offset += LIMIT
                time.sleep(1)  # pause between successive page requests
def set_dates(df):
    """Collapse the date columns of *df* into year/month/day columns, in place.

    For each row, the columns in ``date_fields`` are tried in order. The
    first one whose value is truthy and splits on ``'-'`` into exactly three
    integers supplies the ``year``, ``month`` and ``day`` values, and its
    name is stored in ``date_field``. Rows with no usable value get ``''``
    for the three parts and ``'missing'`` as ``date_field``. The columns
    listed in ``date_fields`` are then dropped from *df*.

    Returns:
        None. *df* is modified in place.
    """
    # Listed in priority order: earlier entries win when several are usable.
    date_fields = [
        'collection_date',  # date that the specimen was collected
        'receipt_date',  # Date on which the sample was received
        'first_created',  # date when first created
        'first_public',  # date when made public
        'last_updated',  # date when last updated
    ]

    def date(row):
        for f in date_fields:
            try:
                if row[f]:
                    (year, month, day) = row[f].split('-')
                    return pd.Series([int(year), int(month), int(day), f])
            except (KeyError, AttributeError, TypeError, ValueError):
                # Unusable value (absent column, non-string, wrong number of
                # parts, non-numeric part): fall through to the next field.
                # Deliberately narrower than a bare ``except`` so that
                # KeyboardInterrupt / SystemExit are not swallowed.
                continue
        return pd.Series(['', '', '', 'missing'])

    # The row-wise result has positional columns 0..3; they are assigned to
    # the four named columns in order.
    df[['year', 'month', 'day', 'date_field']] = df.aggregate(date, axis=1)
    df.drop(columns=date_fields, inplace=True)
def collect_metadata():
    """Yield the downloaded run metadata as a series of DataFrames.

    Rows from ``get_raw_metdata()`` are grouped with ``get_chunks`` into
    lists of at most ``CHUNK_SIZE``. Each group becomes a DataFrame whose
    columns are the keys of ``fields``, extended with:

    * ``displayname`` - the ``isolate`` value when truthy, otherwise the
      ``run_accession``;
    * ``filename`` - the ``run_accession`` with ``'.fa'`` appended;
    * ``Assembled by`` - the constant ``'NCBI'``.

    The frame is finally passed to ``set_dates`` before being yielded.
    """
    CHUNK_SIZE = 500

    def _display_name(record):
        # Prefer the isolate name; fall back to the run accession.
        return record.get('isolate') or record['run_accession']

    raw_rows = get_raw_metdata()
    for rows in get_chunks(raw_rows, CHUNK_SIZE):
        frame = pd.DataFrame(data=rows, columns=fields.keys())
        frame['displayname'] = frame.aggregate(_display_name, axis=1)
        frame['filename'] = frame['run_accession'] + '.fa'
        frame['Assembled by'] = 'NCBI'
        set_dates(frame)
        yield frame
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment