Created
April 3, 2025 22:03
-
-
Save ryanpadilha/137f4c0075127b9a4cab1bf0b011040b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn.model_selection import train_test_split | |
from sklearn.preprocessing import OneHotEncoder | |
from sklearn.compose import ColumnTransformer | |
from sklearn.pipeline import Pipeline | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score | |
from google.colab import drive | |
# Mount Google Drive (Colab) and load the dataset stored under MyDrive.
drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/mex-data/CarPrice_Assignment.csv"
df = pd.read_csv(file_path)

# Drop unnecessary columns
df = df.drop(columns=['car_ID', 'CarName'])

# Identify categorical and numerical columns by kind rather than by exact
# dtype name. A column that appears in neither list is never handed to the
# transformers built from these lists, so a hard-coded dtype list
# (int64/float64/object) would silently leave out columns stored as e.g.
# int32, category or a dedicated string dtype.
numerical_cols = df.select_dtypes(include='number').columns.tolist()
categorical_cols = df.select_dtypes(exclude='number').columns.tolist()
# 'price' is the target, not a feature; list.remove raises ValueError if the
# column was not parsed as numeric, which surfaces a bad CSV early.
numerical_cols.remove('price')
# Preprocessing: numerical columns are forwarded unchanged, categorical
# columns are one-hot encoded (handle_unknown='ignore' lets transform accept
# categories that were not present when the encoder was fitted).
numeric_step = ('num', 'passthrough', numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])
# Separate the target from the features, then hold out 30% of the rows for
# testing; the fixed random_state keeps the split reproducible.
y = df['price']
X = df.drop(columns=['price'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Model: the preprocessing step feeds a random forest regressor with
# 100 trees and a fixed seed.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', regressor),
])

# Fit on the training split, then predict prices for the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Score the predictions against the held-out targets.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
mae = mean_absolute_error(y_test, y_pred)

# Report the metrics, one per line.
print(
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)
# Scatter plot of actual vs. predicted prices
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.7, color='blue', label="Predicted vs. Actual")

# Endpoints of the y = x reference line, spanning the range of actual prices.
# Series.min()/max() are evaluated once, run vectorised, and skip missing
# values, whereas the builtin min()/max() iterate element by element and give
# order-dependent results if a NaN is present.
price_low, price_high = y_test.min(), y_test.max()
plt.plot(
    [price_low, price_high],
    [price_low, price_high],
    color='red',
    linestyle='--',
    label="Perfect Prediction",
)

plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual vs. Predicted Car Prices")
plt.legend()
plt.grid(True)
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment