Let us use the scipy.io
library to read MATLAB files and pandas
with pyarrow
to handle Parquet file conversion.
Here's a comprehensive example:
# First, install required packages if you haven't already
# pip install scipy pandas pyarrow numpy
import scipy.io
import pandas as pd
import numpy as np
def read_matlab_to_dataframe(mat_file_path):
    """Read a MATLAB .mat file and convert its variables to pandas DataFrames.

    Parameters
    ----------
    mat_file_path : str
        Path to the .mat file. Note: scipy.io.loadmat does not support
        MATLAB v7.3 (HDF5-based) files.

    Returns
    -------
    dict[str, pandas.DataFrame] or None
        Mapping of MATLAB variable name to DataFrame, or None if the file
        could not be read at all. Variables that cannot be converted
        (cell arrays, structs, >2-D arrays) are skipped with a message.
    """
    # Keep the try narrow: only a failure to *read* the file returns None;
    # per-variable conversion problems are reported and skipped below.
    try:
        mat_data = scipy.io.loadmat(mat_file_path)
    except Exception as e:
        print(f"Error reading MATLAB file: {str(e)}")
        return None

    # Remove MATLAB metadata entries ('__header__', '__version__', ...).
    data = {key: value for key, value in mat_data.items()
            if not key.startswith('__')}

    dataframes = {}
    for key, value in data.items():
        try:
            if not isinstance(value, np.ndarray):
                # Cell arrays / structs load as non-ndarray objects; say so
                # instead of dropping them silently.
                print(f"Skipping {key}: unsupported type {type(value).__name__}")
                continue
            if value.ndim == 0:
                # 0-d scalar: wrap in a one-row, one-column frame.
                df = pd.DataFrame([value.item()], columns=[key])
            elif value.ndim == 1:
                # 1D array
                df = pd.DataFrame(value, columns=[key])
            elif value.ndim == 2:
                # 2D array: one column per MATLAB column.
                df = pd.DataFrame(
                    value,
                    columns=[f"{key}_{i}" for i in range(value.shape[1])])
            else:
                print(f"Skipping {key}: Higher dimensional array not converted to DataFrame")
                continue
            dataframes[key] = df
        except Exception as e:
            print(f"Error processing {key}: {str(e)}")
            continue
    return dataframes
def save_to_parquet(dataframes, output_path):
    """Write each DataFrame in *dataframes* to Parquet.

    Parameters
    ----------
    dataframes : dict[str, pandas.DataFrame]
        Mapping of dataset name to DataFrame.
    output_path : str
        Target path. With a single DataFrame it is used as-is; with
        several, each file gets a ``_<index>_<name>.parquet`` suffix.

    Returns
    -------
    None
        Errors are reported via print rather than raised.
    """
    # Previously an empty dict fell through to the multi-file branch and
    # did nothing silently; report it explicitly instead.
    if not dataframes:
        print("No DataFrames to save.")
        return
    try:
        if len(dataframes) == 1:
            # Single dataset: write straight to the requested path.
            df = next(iter(dataframes.values()))
            df.to_parquet(output_path)
            print(f"Saved DataFrame to {output_path}")
        else:
            # Multiple datasets: derive one file per dataset from the base name.
            base_path = output_path.rsplit('.', 1)[0]
            for i, (name, df) in enumerate(dataframes.items()):
                file_path = f"{base_path}_{i}_{name}.parquet"
                df.to_parquet(file_path)
                print(f"Saved DataFrame '{name}' to {file_path}")
    except Exception as e:
        print(f"Error saving to Parquet: {str(e)}")
# Example usage
def _main():
    """Demo driver: convert a .mat file to Parquet and summarize each dataset."""
    # Edit these paths before running.
    mat_file_path = "your_data.mat"  # Replace with your .mat file path
    parquet_output_path = "output.parquet"  # Replace with desired output path

    # Load every MATLAB variable as a DataFrame.
    dataframes = read_matlab_to_dataframe(mat_file_path)
    if not dataframes:
        return

    # Describe each dataset before writing it out.
    print("\nFound the following datasets:")
    for name, df in dataframes.items():
        print(f"\nDataset: {name}")
        print(f"Shape: {df.shape}")
        print("First few rows:")
        print(df.head())

    save_to_parquet(dataframes, parquet_output_path)


if __name__ == "__main__":
    _main()
This script provides:
- A function to read MATLAB files using `scipy.io.loadmat`
- Conversion of MATLAB data to pandas DataFrames
- Handling of different data shapes (1D and 2D arrays)
- Saving the results to Parquet format using pandas and pyarrow
- Error handling for various potential issues
To use this script:
- First, install the required packages:
pip install scipy pandas pyarrow numpy
- Replace the file paths in the example usage section:
- `mat_file_path`: Path to your .mat file
- `parquet_output_path`: Where you want to save the Parquet file
Important notes:
- MATLAB Data Structures: This script handles basic numeric arrays. If your .mat file contains complex structures, cell arrays, or other MATLAB-specific data types, you might need to modify the script accordingly.
- Large Files: For very large MATLAB files, you might want to add memory management considerations.
- Parquet Benefits: Saving to Parquet format provides:
- Efficient compression
- Columnar storage
- Better performance for data analysis
- Cross-platform compatibility
- Multiple Datasets: If your .mat file contains multiple variables, the script will create separate Parquet files for each, with numbered suffixes.
To read the Parquet file later, you can simply use:
# Reading the Parquet file
df = pd.read_parquet("output.parquet")
If you have a specific MATLAB file structure that needs special handling, please provide more details, and I can help modify the script accordingly.