A few ways to reduce a DataFrame's memory footprint:
- Drop columns whose information is duplicated elsewhere
- Drop unneeded rows up front
- Downcast numeric columns to the smallest precision that fits (-50%)
- Convert Python strings (object dtype) to pyarrow strings (-30%)
- Convert date strings to pandas datetime (-85%)
- Convert heavily repeated strings to category (-95%; see the sketch after this list)
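To make the last point concrete, here is a minimal sketch on made-up toy data (the rating column and row count are hypothetical, purely for illustration):

import pandas as pd

# Hypothetical toy column: 300k rows, only 3 distinct string values
toy = pd.DataFrame({"rating": ["safe", "questionable", "explicit"] * 100_000})

before = toy.memory_usage(deep=True).sum()
toy["rating"] = toy["rating"].astype("category")  # int8 codes + tiny lookup table
after = toy.memory_usage(deep=True).sum()

print(f"object: {before / 1e6:.1f} MB -> category: {after / 1e6:.1f} MB")

With only three unique values the category codes fit in int8, which is where savings on the order of the -95% figure come from.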
Memory comparison before and after the conversions; usage drops to roughly 1/5 of the original:
Initial shape: (50000, 36)
Initial memory usage: 214.97 MB
Final shape: (42821, 25)
Final memory usage: 41.24 MB
Memory usage per column:
Code:
import pandas as pd
import numpy as np
import unibox as ub
def human_readable_size(size_bytes):
    # Format a byte count as a human-readable string (B/KB/MB/GB/TB)
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_bytes < 1024:
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024
    return f"{size_bytes:.2f} PB"
# Function to display memory usage of each column
def column_memory_usage(dataframe):
    memory_usage = dataframe.memory_usage(deep=True)
    readable_memory_usage = memory_usage.apply(human_readable_size)
    dtypes = dataframe.dtypes
    mem_df = pd.DataFrame({
        'Column': memory_usage.index[1:],  # exclude the Index entry
        'Memory Usage': memory_usage.values[1:],
        'Readable Memory Usage': readable_memory_usage.values[1:],
        'Dtype': dtypes.values
    })
    mem_df = mem_df.sort_values(by='Memory Usage', ascending=False)
    return mem_df
def convert_64_numerics_to_32(df):
    # Downcast 64-bit numerics to the smallest dtype that fits the data
    # (e.g. int64 -> int8/int16/int32, float64 -> float32)
    numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
    for col in numerical_columns:
        df[col] = pd.to_numeric(df[col], downcast='integer' if df[col].dtype == 'int64' else 'float')
    return df
def convert_object_to_category(df, category_columns):
    # Columns with heavy value repetition: store integer codes plus a small
    # lookup table instead of one Python string per row. Categoricals encode
    # missing values as code -1, so NaNs need no special handling.
    for col in category_columns:
        df[col] = df[col].astype('category')
    return df
def join_hashtags(hashtags):
    # Flatten list/array-valued hashtag cells into a single comma-separated
    # string so the column can later be made categorical
    if isinstance(hashtags, (list, np.ndarray)):
        return ", ".join(hashtags)
    elif pd.isna(hashtags):
        return ""
    return hashtags
def convert_object_to_pyarrow_string(df):
    # Object columns not designated as category are converted to the
    # pyarrow-backed string dtype by default
    for col in df.select_dtypes(include=['object']).columns:
        try:
            # Attempt to convert to pyarrow string type
            df[col] = df[col].astype("string[pyarrow]")
        except Exception as e:
            print(f"Failed to convert column {col} to pyarrow string: {e}")
    return df
# Load the DataFrame
df = ub.loads("s3://dataset-ingested/sagemaker/20240614_gdl_twitter_all/0.merged_prefiltered.parquet")
# Initial DataFrame shape and memory usage
print(f"Initial shape: {df.shape}")
initial_memory_usage = df.memory_usage(deep=True).sum()
print(f"Initial memory usage: {human_readable_size(initial_memory_usage)}")
# Filter and drop columns
DROP_COLS = ["clip__clip_scores", "debug_cs_comic_page", "metadata_s3_uri", "twitter__tweet_id", "user_handle", "twitter__author_name",
             "twitter__subcategory", "twitter__category", "twitter__extension",
             "twitter__description", "twitter__content"]  # may be useful later, but not needed for now
df = df[df["prefilter_kept"]]
df = df.dropna(subset=["clip__clip_scores"])  # must run before this column is dropped below
df = df.drop(columns=DROP_COLS)
# Convert the twitter__hashtags column
df['twitter__hashtags'] = df['twitter__hashtags'].apply(join_hashtags)
df["twitter__date"] = pd.to_datetime(df["twitter__date"])
# Convert numerical columns to smaller precision
df = convert_64_numerics_to_32(df)
# Object columns with heavy value repetition become category
# (categoricals handle NaN natively, so no special handling is needed)
category_columns = ["clip__softmax_prompt", "wd__wd_rating", "wd__wd_character",
                    "twitter__hashtags"]
df = convert_object_to_category(df, category_columns)
# Convert object columns to pyarrow string type
df = convert_object_to_pyarrow_string(df)
# Final DataFrame shape and memory usage
print(f"Final shape: {df.shape}")
final_memory_usage = df.memory_usage(deep=True).sum()
print(f"Final memory usage: {human_readable_size(final_memory_usage)}")
# Display memory usage per column
print("Memory usage per column:")
mem_df = column_memory_usage(df)
mem_df
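As a final sanity check (a small sketch assuming the df produced by the script above), confirm that the conversions actually took effect:

# Expect category, string[pyarrow], datetime64[ns], and downcast numeric
# dtypes in place of the original object / 64-bit columns
print(df.dtypes.value_counts())
print(df["twitter__date"].dtype)  # datetime64[ns]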