Last active
February 21, 2023 12:06
-
-
Save carlosribas/b687b5624ba47eb1c68a0c084ea52efa to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Copyright [2009-present] EMBL-European Bioinformatics Institute | |
Licensed under the Apache License, Version 2.0 (the "License"); | |
you may not use this file except in compliance with the License. | |
You may obtain a copy of the License at | |
http://www.apache.org/licenses/LICENSE-2.0 | |
Unless required by applicable law or agreed to in writing, software | |
distributed under the License is distributed on an "AS IS" BASIS, | |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
See the License for the specific language governing permissions and | |
limitations under the License. | |
Usage: python get_ids.py [database] | |
Example: python get_ids.py rfam | |
""" | |
import itertools | |
import json | |
import math | |
import requests # pip install requests | |
import sys | |
def get_query(database, start):
    """
    Function to build the query

    The database name is percent-encoded so that characters with a special
    meaning in a URL (space, "&", "#", ...) cannot truncate or alter the
    query string. Plain names such as "rfam" are left unchanged.

    :param database: name of the Expert DB
    :param start: index of the first entry to fetch
    :return: query string (starting with "?") to append to the endpoint URL
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    from urllib.parse import quote

    return (
        "?query=entry_type:metadata%20AND%20database:"
        + quote(database, safe="")
        + "&fields=job_id&size=100&start="
        + str(start)
        + "&format=json"
    )
def get_results(database):
    """
    Function to get the list of ids from a given database

    Pages through the EBI Search "rnacentral-litscan" endpoint 100 entries
    at a time. The "hitCount" reported by the first response decides how
    many further pages are requested.

    :param database: name of the Expert DB
    :return: results - a list holding one list of job ids per page fetched
        (an empty list when the search reports no hits)
    :raises requests.HTTPError: if the server answers with an error status
    """
    page_size = 100  # must match the "size=100" that get_query() puts in the URL
    timeout = 60  # seconds; without a timeout a stalled connection hangs forever
    endpoint = "https://wwwdev.ebi.ac.uk/ebisearch/ws/rest/rnacentral-litscan"

    def fetch_page(start):
        # Download and decode the page of results beginning at index `start`.
        response = requests.get(endpoint + get_query(database, start), timeout=timeout)
        # Fail here with a clear HTTP error instead of a confusing KeyError
        # or JSON decoding error further down.
        response.raise_for_status()
        return json.loads(response.text)

    def job_ids(page):
        # Each entry stores its job id as the first item of fields["job_id"].
        return [entry["fields"]["job_id"][0] for entry in page["entries"]]

    first_page = fetch_page(0)
    hit_count = first_page["hitCount"]
    if hit_count <= 0:
        return []

    # get ids
    results = [job_ids(first_page)]

    # fetch other ids if needed: the first page is already in hand, so start
    # at the second page and stop once the offset reaches the hit count
    for start in range(page_size, hit_count, page_size):
        results.append(job_ids(fetch_page(start)))

    return results
def main():
    """
    Command-line entry point: fetch the ids of one Expert DB and save them

    Reads the database name from ``sys.argv``, collects its job ids with
    get_results(), and writes the sorted, de-duplicated ids, one per line,
    to ``<database>_ids.txt``. When no id is found, a message is printed and
    no file is written.

    :return: None
    :raises SystemExit: with a non-zero status when the command line does
        not contain exactly one non-empty database name
    """
    args = sys.argv[1:]
    # sys.exit(<str>) writes the message to stderr and exits with status 1,
    # so calling scripts can detect the usage error.
    if not args or (len(args) == 1 and not args[0]):
        sys.exit("You must specify the database")
    if len(args) > 1:
        sys.exit("Usage: python get_ids.py rfam")
    database = args[0]

    # get list of ids
    results = get_results(database)
    if not results:
        print("No id found for database {}".format(database))
        return

    # get_results() returns one list per page: flatten, de-duplicate, sort
    ids = sorted(set(itertools.chain.from_iterable(results)))

    # save results
    with open(database + '_ids.txt', 'w') as f:
        f.writelines(item + '\n' for item in ids)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment