Percy Metadata Downloader in Python
import json
import math
import os
import tempfile
from datetime import datetime as dt

import pandas as pd
import requests
from tqdm import tqdm

STATS_URL = "https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&latest=true"

tempdir = tempfile.gettempdir()
tempfp = os.path.join(tempdir, "percy_metadata_state.json")
existing_file_name = ""

def state_exists(path: str = tempfp):
    """
    Checks if the percy_metadata_state file exists
    in the temporary directory.
    """
    return os.path.isfile(path)

def create_state(path: str = tempfp):
    """
    Creates a new percy_metadata_state.json file
    in the temporary directory.
    """
    template = {
        "last_updated": "",
        "n_scraped": ""
    }
    with open(path, "w") as f:
        f.write(json.dumps(template))

def read_state(path: str = tempfp):
    """
    Reads the percy_metadata_state.json file
    in the temporary directory and returns the number of
    pages that have already been scraped.
    """
    try:
        with open(path, "r") as f:
            return json.load(f)
    except Exception as e:
        print(f"Exception occurred: {e}")

def update_state(update_dict: dict, path: str = tempfp):
    """
    Takes in a dictionary with the new number of
    scraped pages and the timestamp, and
    updates the percy_metadata_state.json file.
    """
    with open(path, "w") as f:
        f.write(json.dumps(update_dict))

def give_me_time():
    """
    Creates a timestamp from the current system time,
    used in state updates and filenames.
    """
    return dt.strftime(dt.now(), '%Y-%m-%d-%H_%M_%S')

def _checkfromfile(filepath: str):
    """
    When no state information is available,
    the script asks for the filepath of a previously downloaded
    metadata csv file, calculates the number of pages
    that have already been scraped and creates a state file.
    """
    if not os.path.isfile(filepath):
        print("Oopsie, you sure the file path is right?")
        return None
    else:
        df = pd.read_csv(filepath)
        n = math.ceil(len(df) / 50)
        create_state()
        ts = give_me_time()
        update_state({"last_updated": ts,
                      "n_scraped": n})
        return n

def n_pages(where: str = "url"):
    """
    Finds the total number of available pages from
    the NASA/JPL website.
    Calls the check-from-file method in case we need to know
    the number of pages that have already been scraped.
    """
    try:
        if where == "url":
            r = requests.get(STATS_URL)
            stats = r.json()
            n = math.ceil(stats["total"] / 50)
            return n
        elif where == "file":
            global existing_file_name
            existing_file_name = input(
                "path/filename of previously downloaded metadata: ")
            n = _checkfromfile(existing_file_name)
            if not n:
                # keep asking until a valid file path is supplied
                return n_pages(where="file")
            return n
    except Exception as e:
        print(f"An error occurred in finding the number of pages: {e}")

def get_image_list(url: str):
    """
    Makes a get request to the images url
    and extracts the image list from its response.
    """
    try:
        r = requests.get(url)
        image_list = r.json()["images"]
        return image_list
    except Exception as e:
        print(e)

def download_metadata(n_pages: int, filename: str = None):
    """
    Once we know the number of pages that are to be downloaded,
    this method visits each of those pages, gets the image list
    from the get_image_list method, builds a pandas dataframe,
    i.e., a table, and saves it to a file.
    """
    dfs = []
    n = n_pages
    progress_bar = tqdm(range(n))
    for page_num in progress_bar:
        progress_bar.set_description(
            "Downloading metadata from page: %d" % page_num)
        url = f"https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&num=50&page={page_num}&order=sol+desc&&&undefined"
        if page_num > 1:
            # later pages use the extended= form of the url
            url = f"https://mars.nasa.gov/rss/api/?feed=raw_images&category=mars2020&feedtype=json&num=50&page={page_num}&order=sol+desc&&&extended="
        il = get_image_list(url)
        if il:  # skip pages whose request failed
            dfs.append(pd.json_normalize(il, sep="_"))
    df = pd.concat(dfs)
    fn = filename if filename else f"./{give_me_time()}_metadata.csv"
    df.reset_index(drop=True).to_csv(fn, index=False)
    print(f"Metadata has been downloaded to {fn}")
    return fn

def download_update(n_scraped: int, current: int):
    """
    Technically, an update checker.
    Gets the total number of pages on the website and the
    number of pages already scraped from the state file.
    Compares them, calculates the number of pages to be downloaded
    and calls the download_metadata method.
    """
    if current > n_scraped:
        update_dict = {"last_updated": give_me_time(),
                       "n_scraped": current}
        update_state(update_dict=update_dict)
        print("Seems Percy has some new data for you! Now downloading...")
        to_download = current - n_scraped
        patch_name = download_metadata(n_pages=to_download)
        merge_with_existing(patch_name=patch_name)
    elif current == n_scraped:
        print("There's nothing new to download at this time, come back later!")

def downloader(where: str = "url"):
    """
    This is where everything happens.
    Calls n_pages with "url" to get the total number of pages.
    Checks for state; in case state doesn't exist,
    asks if data has already been downloaded.
    If metadata has been downloaded, creates state from the file information,
    proceeds to download only the patch and asks if it needs to merge.
    Else, downloads the entire metadata.
    # TODO:
    - Port this script to the percy-image-downloader package.
    - Make state information and logging more robust and clear.
    """
    total = n_pages(where="url")
    # when where == "file", this prompts for the existing csv and seeds the state file
    current = n_pages(where=where)
    state = state_exists()
    if state:
        n_scraped = read_state()["n_scraped"]
        download_update(n_scraped=n_scraped, current=total)
    else:
        inp = (input("Have you previously downloaded the metadata? (y/n): ")).lower()
        if inp == "y":
            downloader(where="file")
        elif inp == "n":
            print("Downloading all of the metadata now!")
            download_metadata(n_pages=total, filename="./full-metadata.csv")

def merge_with_existing(patch_name: str):
    """
    Asks whether the freshly downloaded patch should be appended
    to the existing full metadata file and, if so, merges the two csv files.
    """
    decision = str(input(
        "Do you wish to merge this file with the existing full metadata file? (y/n): ")).lower()
    if decision == "y":
        global existing_file_name
        if not existing_file_name:
            existing_file_name = input(
                "path/filename of previously downloaded metadata: ")
        dfs = [pd.read_csv(patch_name), pd.read_csv(existing_file_name)]
        pd.concat(dfs).reset_index(drop=True).to_csv(
            existing_file_name, index=False)
        print(
            "The changes in metadata have been merged with the existing full metadata file.")
    else:
        print("Thank you Earthling, have a nice sol!")

if __name__ == "__main__":
    downloader()
requests
pandas
tqdm
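With the dependencies above installed, the simplest way to use the script is to run it directly, since the __main__ block calls downloader(). A minimal sketch of driving it from another script, assuming the gist is saved as percy_metadata.py (that filename is my assumption, not part of the gist):

# Assumes the file above is saved as percy_metadata.py (hypothetical name)
# and that requests, pandas and tqdm are installed.
from percy_metadata import downloader, download_metadata, n_pages

# Interactive flow: checks the state file, prompts where needed, and
# downloads either the full metadata or only the newly added pages.
downloader()

# Non-interactive alternative: look up the page count and download
# everything into a named csv file.
total = n_pages(where="url")
download_metadata(n_pages=total, filename="./full-metadata.csv")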
Hello again! I have slightly updated the script to allow merging a newly downloaded patch with the existing large file, and added comments to each of the methods. Please let me know if I need to write more granular comments.
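For context, the merge step amounts to concatenating the freshly downloaded patch csv with the existing full file and rewriting it, as merge_with_existing() does above. A standalone sketch (the file names here are made up for illustration):

import pandas as pd

# Hypothetical file names, purely for illustration.
patch = pd.read_csv("./2021-03-11-10_00_00_metadata.csv")
full = pd.read_csv("./full-metadata.csv")

# Append the patch to the full metadata and rewrite the full file,
# mirroring what merge_with_existing() does after download_update().
pd.concat([patch, full]).reset_index(drop=True).to_csv(
    "./full-metadata.csv", index=False)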
I will be adding command-line flags soon that will reduce the user's interaction with the script. (Argh, all this coursework!)
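Nothing is implemented yet, but a rough sketch of what such flags might look like with argparse (the flag names are my own guesses, not part of the script):

import argparse

# Hypothetical flags; the gist does not define a CLI yet.
parser = argparse.ArgumentParser(description="Percy metadata downloader")
parser.add_argument("--existing", default="",
                    help="path to a previously downloaded metadata csv")
parser.add_argument("--output", default=None,
                    help="csv file to write the metadata to")
parser.add_argument("--full", action="store_true",
                    help="download all pages instead of only the new ones")
args = parser.parse_args()

# With flags like these, downloader() / download_metadata() could run
# without any input() prompts.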
[EDIT]
I currently feel that this code is more spaghetti than it is organised; I will try to make it clean and readable as soon as I can.