A few ways to reduce a DataFrame's memory footprint:
- Drop columns whose information is duplicated elsewhere
- Drop unneeded rows up front
- Downcast numeric columns to the smallest precision that fits (-50%)
- Convert Python strings (object dtype) to pyarrow strings (-30%)
- Convert date strings to pandas datetime (-85%)
- Convert heavily repeated strings to category (-95%; see the sketch after this list)
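To make the last point concrete, here is a minimal sketch on made-up toy data (the rating column and row count are hypothetical, purely for illustration):

import pandas as pd

# Hypothetical toy column: 300k rows, only 3 distinct string values
toy = pd.DataFrame({"rating": ["safe", "questionable", "explicit"] * 100_000})

before = toy.memory_usage(deep=True).sum()
toy["rating"] = toy["rating"].astype("category")  # int8 codes + tiny lookup table
after = toy.memory_usage(deep=True).sum()

print(f"object: {before / 1e6:.1f} MB -> category: {after / 1e6:.1f} MB")

With only three unique values the category codes fit in int8, which is where savings on the order of the -95% figure come from.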
Memory comparison before and after the conversions; usage drops to roughly 1/5 of the original:
Initial shape: (50000, 36)
Initial memory usage: 214.97 MB
Final shape: (42821, 25)
Final memory usage: 41.24 MB
Memory usage per column:
Code:
import pandas as pd
import numpy as np
import unibox as ub
def human_readable_size(size_bytes):
    # Format a byte count as a human-readable string (B/KB/MB/GB/TB)
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_bytes < 1024:
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024
    return f"{size_bytes:.2f} PB"
# Function to display memory usage of each column
def column_memory_usage(dataframe):
    memory_usage = dataframe.memory_usage(deep=True)
    readable_memory_usage = memory_usage.apply(human_readable_size)
    dtypes = dataframe.dtypes
    mem_df = pd.DataFrame({
        'Column': memory_usage.index[1:],  # exclude the Index entry
        'Memory Usage': memory_usage.values[1:],
        'Readable Memory Usage': readable_memory_usage.values[1:],
        'Dtype': dtypes.values
    })
    mem_df = mem_df.sort_values(by='Memory Usage', ascending=False)
    return mem_df
def convert_64_numerics_to_32(df):
    # Downcast 64-bit numerics to the smallest dtype that fits the data
    # (e.g. int64 -> int8/int16/int32, float64 -> float32)
    numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
    for col in numerical_columns:
        df[col] = pd.to_numeric(df[col], downcast='integer' if df[col].dtype == 'int64' else 'float')
    return df
def convert_object_to_category(df, category_columns):
    # Columns with heavy value repetition: store integer codes plus a small
    # lookup table instead of one Python string per row. Categoricals encode
    # missing values as code -1, so NaNs need no special handling.
    for col in category_columns:
        df[col] = df[col].astype('category')
    return df
def join_hashtags(hashtags):
    # Flatten list/array-valued hashtag cells into a single comma-separated
    # string so the column can later be made categorical
    if isinstance(hashtags, (list, np.ndarray)):
        return ", ".join(hashtags)
    elif pd.isna(hashtags):
        return ""
    return hashtags
def convert_object_to_pyarrow_string(df):
    # Object columns not designated as category are converted to the
    # pyarrow-backed string dtype by default
    for col in df.select_dtypes(include=['object']).columns:
        try:
            # Attempt to convert to pyarrow string type
            df[col] = df[col].astype("string[pyarrow]")
        except Exception as e:
            print(f"Failed to convert column {col} to pyarrow string: {e}")
    return df
# Load the DataFrame
df = ub.loads("s3://dataset-ingested/sagemaker/20240614_gdl_twitter_all/0.merged_prefiltered.parquet")
# Initial DataFrame shape and memory usage
print(f"Initial shape: {df.shape}")
initial_memory_usage = df.memory_usage(deep=True).sum()
print(f"Initial memory usage: {human_readable_size(initial_memory_usage)}")
# Filter and drop columns
DROP_COLS = ["clip__clip_scores", "debug_cs_comic_page", "metadata_s3_uri", "twitter__tweet_id", "user_handle", "twitter__author_name",
             "twitter__subcategory", "twitter__category", "twitter__extension",
             "twitter__description", "twitter__content"]  # may be useful later, but not needed for now
df = df[df["prefilter_kept"]]
df = df.dropna(subset=["clip__clip_scores"])  # must run before this column is dropped below
df = df.drop(columns=DROP_COLS)
# Convert the twitter__hashtags column
df['twitter__hashtags'] = df['twitter__hashtags'].apply(join_hashtags)
df["twitter__date"] = pd.to_datetime(df["twitter__date"])
# Convert numerical columns to smaller precision
df = convert_64_numerics_to_32(df)
# Object columns with heavy value repetition become category
# (categoricals handle NaN natively, so no special handling is needed)
category_columns = ["clip__softmax_prompt", "wd__wd_rating", "wd__wd_character",
                    "twitter__hashtags"]
df = convert_object_to_category(df, category_columns)
# Convert object columns to pyarrow string type
df = convert_object_to_pyarrow_string(df)
# Final DataFrame shape and memory usage
print(f"Final shape: {df.shape}")
final_memory_usage = df.memory_usage(deep=True).sum()
print(f"Final memory usage: {human_readable_size(final_memory_usage)}")
# Display memory usage per column
print("Memory usage per column:")
mem_df = column_memory_usage(df)
mem_df
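As a final sanity check (a small sketch assuming the df produced by the script above), confirm that the conversions actually took effect:

# Expect category, string[pyarrow], datetime64[ns], and downcast numeric
# dtypes in place of the original object / 64-bit columns
print(df.dtypes.value_counts())
print(df["twitter__date"].dtype)  # datetime64[ns]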