Checking chunk sizes [DRAFT]
# NOTE: draft excerpt. Standard imports are consolidated here; `app` (the Typer
# application), `logger`, and the shared parameter helpers
# `typer_argument_source_directory` / `typer_option_filename_pattern` are
# assumed to be defined elsewhere in the surrounding project.
import json
import logging
from pathlib import Path
from typing import Annotated

import typer
import xarray as xr


@app.command(
    'check-chunks',
    no_args_is_help=True,
    help='Check for chunk size consistency along series of files in a format supported by Xarray',
)
def check_chunk_consistency(
    source_directory: Annotated[Path, typer_argument_source_directory],
    pattern: Annotated[str, typer_option_filename_pattern] = "*.json",
    verbose: Annotated[int, typer.Option(..., "--verbose", "-v", count=True, help="Increase verbosity level.")] = 0,
):
""" """ | |
source_directory = Path(source_directory) | |
file_paths = list(source_directory.glob(pattern)) | |
files = list(map(str, file_paths)) | |
chunk_sizes = {} # dictionary to store chunk sizes of first file | |
for file in files: | |
with xr.open_dataset(file, engine="netcdf4") as dataset: | |
if not chunk_sizes: # populate with chunk sizes | |
for variable in dataset.variables: | |
if dataset[variable].encoding.get("chunksizes"): | |
chunk_sizes[variable] = dataset[variable].encoding["chunksizes"] | |
logger.debug(f'File : {file}, Chunks : {chunk_sizes}') | |
else: | |
# For subsequent files, check if chunk sizes match the initial ones | |
for variable in dataset.variables: | |
if ( | |
dataset[variable].encoding.get("chunksizes") | |
and chunk_sizes.get(variable) | |
!= dataset[variable].encoding["chunksizes"] | |
): | |
raise ValueError( | |
f"Chunk size mismatch in file '{file}' for variable '{variable}'. Expected {chunk_sizes[variable]} but got {dataset[variable].encoding['chunksizes']}" | |
) | |
else: | |
logger.debug(f'Variable : {variable}, Chunks : {dataset[variable].encoding["chunksizes"]}') | |
print("All files have consistent chunk sizes!") | |


def get_chunk_sizes_from_json(file_path, variable):
    """Read the chunk sizes of `variable` from a Kerchunk reference file."""
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        json_string = data['refs'].get(f'{variable}/.zarray')
        if not json_string:
            logger.warning(f"'{variable}/.zarray' not found in file {file_path}. Skipping...")
            return {}
        zarray_metadata = json.loads(json_string)
        chunk_sizes = {variable: zarray_metadata.get("chunks")}
        logger.info(f'File : {file_path}, Variable: {variable}, Chunk sizes: {chunk_sizes}')
        return chunk_sizes
    except Exception as e:
        logger.error(f"Error processing file {file_path}: {e}")
        return {}


def compare_chunk_sizes_json(file, variable, initial_chunk_sizes):
    """Compare the chunk sizes found in `file` against the reference sizes."""
    logger.info(f'Comparing file {file}')
    current_chunk_sizes = get_chunk_sizes_from_json(file, variable)
    mismatched_variables = [
        (name, size)
        for name, size in current_chunk_sizes.items()
        if initial_chunk_sizes.get(name) != size
    ]
    if mismatched_variables:
        name, size = mismatched_variables[0]
        expected_size = initial_chunk_sizes.get(name)
        logger.error(f"Chunk size mismatch in file {file} for variable {name}. Expected {expected_size} but got {size}")
        return False
    return True


@app.command(
    'check-chunks-json',
    no_args_is_help=True,
    help='Check for chunk size consistency along series of kerchunk reference files',
)
def validate_chunk_sizes(
    source_directory: Annotated[Path, typer_argument_source_directory],
    variable: Annotated[str, typer.Argument(..., help='Variable name to select from')],
    pattern: Annotated[str, typer_option_filename_pattern] = "*.json",
):
    """Check that all Kerchunk reference files matching `pattern` share identical chunk sizes."""
    source_directory = Path(source_directory)
    file_paths = list(source_directory.glob(pattern))
    files = list(map(str, file_paths))
    if not files:
        logger.error(f"No files matching '{pattern}' found in {source_directory}. Exiting...")
        return
    # Use the chunk sizes of the first file as the comparison reference
    initial_chunk_sizes = get_chunk_sizes_from_json(files[0], variable)
    if not initial_chunk_sizes:
        logger.error(f"Cannot read chunk sizes from initial file {files[0]}. Exiting...")
        return
    all_match = True
    for file in files[1:]:
        if not compare_chunk_sizes_json(file, variable, initial_chunk_sizes):
            all_match = False
    if all_match:
        logger.info("All files have consistent chunk sizes!")
        print("All files have consistent chunk sizes!")
    else:
        logger.warning("Some files have inconsistent chunk sizes. Check the logs for details.")
        print("Some files have inconsistent chunk sizes. Check the logs for details.")