Skip to content

Instantly share code, notes, and snippets.

@nocollier
Last active September 6, 2023 14:26
Show Gist options
  • Save nocollier/2090d8de514a41631fafcd0199307dd9 to your computer and use it in GitHub Desktop.
Save nocollier/2090d8de514a41631fafcd0199307dd9 to your computer and use it in GitHub Desktop.
ESGF Globus query
import re
import pandas as pd
from globus_sdk import SearchClient
from globus_sdk.response import GlobusHTTPResponse
# UUID of the Globus Search index queried by this script (the test ESGF2 Argonne index).
INDEX_ID = "d927e2d9-ccdb-48e4-b05d-adbc3d97bbc5"
def response_to_dataframe(response: "GlobusHTTPResponse", pattern: str) -> pd.DataFrame:
    """Return the dataset entries from the response of a Globus search.

    Every item of ``response.get("gmeta")`` is expected to hold exactly one
    entry. Items whose entry id is not ``"dataset"`` are skipped; for the
    rest, the named groups that ``pattern`` captures from the item's subject
    become one row of the result.

    Parameters
    ----------
    response
        The search response. Only ``response.get("gmeta")`` is read, so any
        mapping-like object with that method works.
    pattern
        Regular expression with named groups, applied to each subject with
        ``re.search`` semantics.

    Returns
    -------
    pd.DataFrame
        One row per dataset subject that matched ``pattern`` and one column
        per named group. When nothing matched, the frame is empty but still
        carries the named-group columns (if the pattern defines any).

    Raises
    ------
    AssertionError
        If an item does not hold exactly one entry.
    """
    regex = re.compile(pattern)  # compiled once instead of once per item
    rows = []
    # A missing/None "gmeta" is treated the same as an empty result list.
    for item in response.get("gmeta") or []:
        entries = item["entries"]
        if len(entries) != 1:
            # Raised explicitly (not via ``assert``) so that the check on the
            # single-entry assumption still runs under ``python -O``.
            raise AssertionError(
                f"expected exactly one entry for subject "
                f"{item.get('subject')!r}, found {len(entries)}"
            )
        if entries[0]["entry_id"] != "dataset":  # manually remove files
            continue
        match = regex.search(item["subject"])
        if match:
            rows.append(match.groupdict())
    # Passing the group names keeps the columns present even with zero rows;
    # ``None`` (no named groups) falls back to pandas' default inference.
    return pd.DataFrame(rows, columns=list(regex.groupindex) or None)
def get_dataset_pattern() -> str:
    """Return the Globus subject re pattern for datasets.

    The pattern defines one named group per facet listed below: all but the
    last are joined by literal dots, and the last one (``data_node``) follows
    a literal ``|`` character.
    """
    columns = [
        "mip_era",
        "activity_id",
        "institution_id",
        "source_id",
        "experiment_id",
        "member_id",
        "table_id",
        "variable_id",
        "grid_label",
        "version",
        "data_node",
    ]
    *facets, node = columns
    # Raw strings: the regex escapes (\. \S \|) are not valid Python string
    # escapes and trigger a warning in ordinary string literals.
    # NOTE(review): each dotted facet group needs at least two characters
    # (one \S plus one or more of [^.|]) - confirm no single-character facet
    # values occur in the index.
    pattern = r"\.".join(rf"(?P<{name}>\S[^.|]+)" for name in facets)
    pattern += rf"\|(?P<{node}>\S+)"
    return pattern
# specify a search like we are used to: facet name -> required value
SEARCH = {
    "activity_id": "CMIP",
    "experiment_id": "historical",
    "source_id": "CESM2",
    "variable_id": "tas",
    "member_id": "r1i1p1f1",
}

# try using SearchClient.search()
# Build the advanced query as a list of clauses and AND them together. Joining
# (rather than post-processing the string with replace) leaves no dangling
# "AND" when SEARCH is empty and cannot rewrite text inside a facet value.
clauses = ['type: "Dataset"']
clauses.extend(f'{key}: "{val}"' for key, val in SEARCH.items())
query = " AND ".join(clauses)

client = SearchClient()
result = client.search(INDEX_ID, query, limit=100, advanced=True)
df = response_to_dataframe(result, get_dataset_pattern())

# what did I get out of this search?
print(query)
print(f"{len(df)=}")
print(f"{len(result.get('gmeta'))=}")
print(f"{result['total']=}")
for key in SEARCH:
    # Skip facets with no column in the frame (e.g. the search returned no
    # datasets) instead of failing with a KeyError.
    if key == "type" or key not in df.columns:
        continue
    print(f"{key}: {df[key].unique()}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment