Skip to content

Instantly share code, notes, and snippets.

@jakehemmerle
Created October 30, 2024 16:18
Show Gist options
  • Save jakehemmerle/3203efa01ac455c2a8a26669eb465af2 to your computer and use it in GitHub Desktop.
Save jakehemmerle/3203efa01ac455c2a8a26669eb465af2 to your computer and use it in GitHub Desktop.
Apple Silicon: Token Generation as a function of GPU-core count and Memory Bandwidth
# Plot Apple Silicon token-generation throughput against memory bandwidth and GPU core count.
import pandas as pd
import matplotlib.pyplot as plt
# Benchmark table: one row per chip / GPU-core configuration. The three
# *_TG_t_s columns are the token-generation rates that get plotted (in t/s)
# for the F16, Q8_0 and Q4_0 series.
# NOTE(review): a value of 0 looks like a placeholder for "no measurement"
# (every M3 Ultra entry is 0) -- confirm against the benchmark source.
data = {
    'Model': [
        'M1', 'M1', 'M1 Pro', 'M1 Pro', 'M1 Max', 'M1 Max', 'M1 Ultra', 'M1 Ultra',
        'M2', 'M2', 'M2 Pro', 'M2 Pro', 'M2 Max', 'M2 Max', 'M2 Ultra', 'M2 Ultra',
        'M3', 'M3', 'M3 Pro', 'M3 Pro', 'M3 Max', 'M3 Max', 'M3 Ultra', 'M3 Ultra',
    ],
    'BW_GB_s': [
        68, 68, 200, 200, 400, 400, 800, 800,
        100, 100, 200, 200, 400, 400, 800, 800,
        100, 100, 150, 150, 300, 400, 800, 800,
    ],
    'GPU_Cores': [
        7, 8, 14, 16, 24, 32, 48, 64,
        8, 10, 16, 19, 30, 38, 60, 76,
        8, 10, 14, 18, 30, 40, 60, 80,
    ],
    'F16_TG_t_s': [
        0, 0, 12.75, 12.75, 22.55, 23.03, 33.92, 37.01,
        0, 6.72, 12.47, 13.06, 24.16, 24.65, 39.86, 41.02,
        0, 0, 0, 9.89, 19.54, 25.09, 0, 0,
    ],
    'Q8_0_TG_t_s': [
        7.92, 7.91, 21.95, 22.34, 37.81, 40.2, 55.69, 59.87,
        12.18, 12.21, 22.7, 23.01, 39.97, 41.83, 62.14, 66.64,
        0, 12.27, 17.44, 17.53, 34.3, 42.75, 0, 0,
    ],
    'Q4_0_TG_t_s': [
        14.19, 14.15, 35.52, 36.41, 54.61, 61.19, 74.93, 83.73,
        21.7, 21.91, 37.87, 38.86, 60.99, 65.95, 88.64, 94.27,
        0, 21.34, 30.65, 30.74, 56.58, 66.31, 0, 0,
    ],
}

# Create a dataframe (kept unfiltered; zero entries are dropped only at plot time).
df = pd.DataFrame(data)

# (column name, marker colour, legend label) for every series drawn on a panel.
TG_SERIES = (
    ('F16_TG_t_s', 'blue', 'F16 TG'),
    ('Q8_0_TG_t_s', 'orange', 'Q8 TG'),
    ('Q4_0_TG_t_s', 'green', 'Q4 TG'),
)


def measured_points(frame, x_col, y_col):
    """Return the ``(x, y)`` columns of *frame* for rows where ``y_col`` > 0.

    Rows whose ``y_col`` value is 0 are dropped so that they are not drawn as
    genuine 0 t/s results sitting on the x-axis.
    """
    has_value = frame[y_col] > 0
    return frame.loc[has_value, x_col], frame.loc[has_value, y_col]


def plot_tg_panel(ax, frame, x_col, title, x_label, x_max):
    """Scatter every series in ``TG_SERIES`` against ``x_col`` on *ax*.

    Args:
        ax: Matplotlib axes to draw on.
        frame: DataFrame holding ``x_col`` and the ``TG_SERIES`` columns.
        x_col: Column used for the x-axis.
        title: Panel title.
        x_label: X-axis label.
        x_max: Upper x-axis limit (the lower limit is 0).
    """
    for y_col, colour, label in TG_SERIES:
        xs, ys = measured_points(frame, x_col, y_col)
        ax.scatter(xs, ys, color=colour, label=label, alpha=0.5)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Token Generation (t/s)')
    ax.set_xlim(0, x_max)
    ax.set_ylim(0, 100)
    ax.legend()
    # Explicit True: a bare grid() call toggles rather than enables the grid.
    ax.grid(True)


def main():
    """Draw both panels side by side and show the figure."""
    fig, (bw_ax, cores_ax) = plt.subplots(1, 2, figsize=(12, 8))
    # Left panel: token generation (all quantizations) vs memory bandwidth.
    plot_tg_panel(bw_ax, df, 'BW_GB_s', 'TG vs Memory Bandwidth',
                  'Memory Bandwidth (GB/s)', 1000)
    # Right panel: token generation (all quantizations) vs GPU core count.
    plot_tg_panel(cores_ax, df, 'GPU_Cores', 'TG vs GPU Cores',
                  'GPU Cores', 100)
    fig.tight_layout()
    plt.show()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment