tannnt810 · July 18, 2024 23:19
diff --git a/gistfile1.txt b/gistfile1.txt
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
 import lightgbm as lgb
 import matplotlib.pyplot as plt

 # --- 1. Build a synthetic house-price dataset ---
 np.random.seed(42)  # fixed seed so every run draws the same sample
 num_samples = 10000

 # Feature draws are made in a fixed order (area, bedrooms, distance, pool)
 # so the random stream is consumed identically on every run.
 area = np.random.randint(30, 150, num_samples)      # integers in [30, 150)
 bedrooms = np.random.randint(1, 5, num_samples)     # integers in [1, 5)
 distance = np.random.randint(1, 20, num_samples)    # integers in [1, 20)
 has_pool = np.random.choice([0, 1], num_samples)    # 0 or 1

 data = {
      'diện_tích': area,
      'số_phòng_ngủ': bedrooms,
      'khoảng_cách_trung_tâm': distance,
      'có_bể_bơi': has_pool,
 }
 df = pd.DataFrame(data)

 # Price is a linear combination of the features plus Gaussian noise
 # (mean 0, standard deviation 200).
 noise = np.random.normal(0, 200, num_samples)
 base_price = (
      1000
      + 20 * df['diện_tích']
      + 500 * df['số_phòng_ngủ']
      - 100 * df['khoảng_cách_trung_tâm']
      + 1000 * df['có_bể_bơi']
 )
 df['giá_nhà'] = base_price + noise

 # Binary label: 1 when the price is below the sample mean, otherwise 0.
 below_mean = df['giá_nhà'] < df['giá_nhà'].mean()
 df['có_mua'] = np.where(below_mean, 1, 0)

 # --- Preview the first 10 rows of the dataset ---
 print("10 dòng dữ liệu đầu tiên:")
 print(df.head(10))

 # --- 2. Prepare features and targets ---
 target_columns = ['giá_nhà', 'có_mua']
 X = df.drop(columns=target_columns)
 y_regression = df['giá_nhà']
 y_classification = df['có_mua']

 # A single call splits the features and both targets together so the rows
 # stay aligned (20% of the rows go to the test set, fixed seed).
 (
      X_train, X_test,
      y_train_reg, y_test_reg,
      y_train_cls, y_test_cls,
 ) = train_test_split(
      X, y_regression, y_classification, test_size=0.2, random_state=42
 )

 # --- 3. Train the models with callbacks ---
 lgb_reg = lgb.LGBMRegressor()
 lgb_cls = lgb.LGBMClassifier()  # classification model

 # Early stopping must not watch the test set: the stopping point would then
 # be tuned on the very data used for the final metrics, which biases them.
 # Hold out part of the training data as a validation set instead.
 X_fit, X_val, y_fit_reg, y_val_reg, y_fit_cls, y_val_cls = train_test_split(
      X_train, y_train_reg, y_train_cls, test_size=0.2, random_state=42
 )

 # Each fit gets its own callback instances: stop after 10 rounds without
 # improvement on the validation set and log the metric every 10 rounds.
 lgb_reg.fit(
      X_fit,
      y_fit_reg,
      eval_set=[(X_val, y_val_reg)],
      callbacks=[
           lgb.early_stopping(stopping_rounds=10, verbose=True),
           lgb.log_evaluation(period=10),
      ],
 )

 lgb_cls.fit(
      X_fit,
      y_fit_cls,
      eval_set=[(X_val, y_val_cls)],
      callbacks=[
           lgb.early_stopping(stopping_rounds=10, verbose=True),
           lgb.log_evaluation(period=10),
      ],
 )

 # --- Plot the metric recorded during training ---
 evals_result = lgb_reg.evals_result_
 # One curve per evaluation set, using the 'l2' (MSE) history.
 for dataset, metrics in evals_result.items():
      plt.plot(metrics['l2'], label=dataset)
 plt.xlabel('Số vòng lặp')
 plt.ylabel('MSE (l2)')
 plt.legend()
 plt.show()

 # --- 4. Predict on the test set and evaluate ---
 y_pred_reg = lgb_reg.predict(X_test)
 mse = mean_squared_error(y_test_reg, y_pred_reg)
 r2 = r2_score(y_test_reg, y_pred_reg)

 # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
 # with n test rows and p feature columns.
 n, p = len(y_test_reg), X_test.shape[1]
 unexplained = 1 - r2
 adjusted_r2 = 1 - unexplained * (n - 1) / (n - p - 1)

 print(f"Mean Squared Error (Regression): {mse:.2f}")
 print(f"R-squared (Regression): {r2:.2f}")
 print(f"Adjusted R-squared (Regression): {adjusted_r2:.2f}")

 # Classification quality on the same test rows.
 y_pred_cls = lgb_cls.predict(X_test)
 accuracy = accuracy_score(y_test_cls, y_pred_cls)
 print(f"Accuracy (Classification): {accuracy:.2f}")

 # --- 5. Score a new, unseen house ---
 # Single-row frame with the same feature columns the models were trained on.
 new_house = {
      'diện_tích': 80,
      'số_phòng_ngủ': 3,
      'khoảng_cách_trung_tâm': 5,
      'có_bể_bơi': 1,
 }
 new_data = pd.DataFrame([new_house])

 predicted_value = lgb_reg.predict(new_data)
 predicted_class = lgb_cls.predict(new_data)

 print(f"Giá nhà dự đoán (triệu VND): {predicted_value[0]:.2f}")
 print(f"Có mua hay không (1: Có, 0: Không): {predicted_class[0]}")

 # Draw tree 0 of the regressor (change tree_index to inspect other trees).
 lgb.plot_tree(lgb_reg, figsize=(30, 15), tree_index=0)
 plt.show()
diff --git a/gistfile2.txt b/gistfile2.txt
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.metrics import mean_squared_error, r2_score, make_scorer
 import lightgbm as lgb
 import matplotlib.pyplot as plt

 # --- 1. Build a synthetic house-price dataset ---
 np.random.seed(42)  # fixed seed so every run draws the same sample
 num_samples = 10000

 # Feature draws are made in a fixed order (area, bedrooms, distance, pool)
 # so the random stream is consumed identically on every run.
 area = np.random.randint(30, 150, num_samples)      # integers in [30, 150)
 bedrooms = np.random.randint(1, 5, num_samples)     # integers in [1, 5)
 distance = np.random.randint(1, 20, num_samples)    # integers in [1, 20)
 has_pool = np.random.choice([0, 1], num_samples)    # 0 or 1

 data = {
      'diện_tích': area,
      'số_phòng_ngủ': bedrooms,
      'khoảng_cách_trung_tâm': distance,
      'có_bể_bơi': has_pool,
 }
 df = pd.DataFrame(data)

 # Price is a linear combination of the features plus Gaussian noise
 # (mean 0, standard deviation 200).
 noise = np.random.normal(0, 200, num_samples)
 base_price = (
      1000
      + 20 * df['diện_tích']
      + 500 * df['số_phòng_ngủ']
      - 100 * df['khoảng_cách_trung_tâm']
      + 1000 * df['có_bể_bơi']
 )
 df['giá_nhà'] = base_price + noise

 # Binary label: 1 when the price is below the sample mean, otherwise 0.
 below_mean = df['giá_nhà'] < df['giá_nhà'].mean()
 df['có_mua'] = np.where(below_mean, 1, 0)

 # --- Preview the first 10 rows of the dataset ---
 print("10 dòng dữ liệu đầu tiên:")
 print(df.head(10))

 # --- 2. Prepare features and targets ---
 target_columns = ['giá_nhà', 'có_mua']
 X = df.drop(columns=target_columns)
 y_regression = df['giá_nhà']
 y_classification = df['có_mua']

 # A single call splits the features and both targets together so the rows
 # stay aligned (20% of the rows go to the test set, fixed seed).
 (
      X_train, X_test,
      y_train_reg, y_test_reg,
      y_train_cls, y_test_cls,
 ) = train_test_split(
      X, y_regression, y_classification, test_size=0.2, random_state=42
 )

 # --- 3. Hyper-parameter tuning with GridSearchCV ---
 # Base LightGBM regressor handed to the grid search.
 lgb_model = lgb.LGBMRegressor()

 # Grid of hyper-parameters to search (3 x 3 x 3 = 27 combinations).
 param_grid = {
      'learning_rate': [0.01, 0.1, 1],
      'n_estimators': [50, 100, 200],
      'max_depth': [3, 5, 7],
 }

 # Candidates are ranked by plain R-squared (r2_score); the score is NOT
 # adjusted for the number of features.
 grid_search = GridSearchCV(
      estimator=lgb_model,
      param_grid=param_grid,
      scoring=make_scorer(r2_score),
      cv=5,  # 5-fold cross-validation
      verbose=1,
      n_jobs=-1,  # use every CPU core
 )

 # Run the search on the training data only.
 grid_search.fit(X_train, y_train_reg)

 # Best estimator found by the search.
 best_lgb_reg = grid_search.best_estimator_

 # --- 4. Report the tuning results ---
 print("Kết quả tuning:")
 print("Siêu tham số tốt nhất:", grid_search.best_params_)
 # best_score_ is the mean cross-validated R-squared of the best candidate,
 # so the label must not call it "Adj. R2".
 print("Điểm số tốt nhất (R2 trung bình qua CV):", grid_search.best_score_)

 # --- 3. Train the model with callbacks ---
 lgb_reg = lgb.LGBMRegressor()

 # Adjusted R-squared values appended by evaluate_model, one per call.
 adjusted_r2_scores = []

 def evaluate_model(env):
      """Append the current adjusted R-squared on the test set to adjusted_r2_scores.

      Intended to be passed in ``callbacks``; only ``env.model`` is read from
      the callback environment.
      """
      predictions = env.model.predict(X_test)
      n_rows, n_features = len(y_test_reg), X_test.shape[1]
      unexplained = 1 - r2_score(y_test_reg, predictions)
      adjusted_r2_scores.append(
           1 - unexplained * (n_rows - 1) / (n_rows - n_features - 1)
      )

 # Early stopping must not watch the test set: the stopping point would then
 # be tuned on the very data used for the final metrics, which biases them.
 # Hold out part of the training data as a validation set instead.
 X_fit, X_val, y_fit_reg, y_val_reg = train_test_split(
      X_train, y_train_reg, test_size=0.2, random_state=42
 )

 lgb_reg.fit(
      X_fit,
      y_fit_reg,
      eval_set=[(X_val, y_val_reg)],
      callbacks=[
           lgb.early_stopping(stopping_rounds=10, verbose=True),
           lgb.log_evaluation(period=10),
           evaluate_model,  # records the adjusted R-squared for the later plot
      ],
 )

 # --- Plot MSE and adjusted R-squared side by side ---
 evals_result = lgb_reg.evals_result_
 epochs = len(evals_result['valid_0']['l2'])

 fig, (ax_mse, ax_r2) = plt.subplots(1, 2, figsize=(12, 5))

 # Left panel: 'l2' (MSE) history of every evaluation set.
 for dataset, metrics in evals_result.items():
      ax_mse.plot(metrics['l2'], label=dataset)
 ax_mse.set_xlabel('Số vòng lặp')
 ax_mse.set_ylabel('MSE (l2)')
 ax_mse.legend()

 # Right panel: adjusted R-squared collected by the callback, plotted
 # against 1-based iteration numbers.
 ax_r2.plot(range(1, epochs + 1), adjusted_r2_scores, label='Adjusted R-squared')
 ax_r2.set_xlabel('Số vòng lặp')
 ax_r2.set_ylabel('Adjusted R-squared')
 ax_r2.legend()

 fig.tight_layout()
 plt.show()

 # --- 4. Predict on the test set and evaluate ---
 y_pred_reg = lgb_reg.predict(X_test)
 mse = mean_squared_error(y_test_reg, y_pred_reg)
 r2 = r2_score(y_test_reg, y_pred_reg)

 # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
 # with n test rows and p feature columns.
 n = len(y_test_reg)
 p = X_test.shape[1]
 adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

 print(f"Mean Squared Error (Regression): {mse:.2f}")
 print(f"R-squared (Regression): {r2:.2f}")
 print(f"Adjusted R-squared (Regression): {adjusted_r2:.2f}")

 # This script never imported accuracy_score nor created a classifier, so the
 # classification steps below used to fail with NameError. Import the metric
 # and train the classifier here, on the training rows only.
 from sklearn.metrics import accuracy_score

 lgb_cls = lgb.LGBMClassifier()
 lgb_cls.fit(X_train, y_train_cls)

 y_pred_cls = lgb_cls.predict(X_test)
 accuracy = accuracy_score(y_test_cls, y_pred_cls)
 print(f"Accuracy (Classification): {accuracy:.2f}")

 # --- 5. Score a new, unseen house ---
 # Single-row frame with the same feature columns the models were trained on.
 new_data = pd.DataFrame({
      'diện_tích': [80],
      'số_phòng_ngủ': [3],
      'khoảng_cách_trung_tâm': [5],
      'có_bể_bơi': [1],
 })

 predicted_value = lgb_reg.predict(new_data)
 predicted_class = lgb_cls.predict(new_data)

 print(f"Giá nhà dự đoán (triệu VND): {predicted_value[0]:.2f}")
 print(f"Có mua hay không (1: Có, 0: Không): {predicted_class[0]}")
diff --git a/gistfile3.txt b/gistfile3.txt
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.metrics import mean_squared_error, r2_score, make_scorer
 from sklearn.model_selection import learning_curve, validation_curve
 import lightgbm as lgb
 import matplotlib.pyplot as plt

 # --- 1. Build a synthetic house-price dataset ---
 np.random.seed(42)  # fixed seed so every run draws the same sample
 num_samples = 10000

 # Feature draws are made in a fixed order (area, bedrooms, distance, pool)
 # so the random stream is consumed identically on every run.
 area = np.random.randint(30, 150, num_samples)      # integers in [30, 150)
 bedrooms = np.random.randint(1, 5, num_samples)     # integers in [1, 5)
 distance = np.random.randint(1, 20, num_samples)    # integers in [1, 20)
 has_pool = np.random.choice([0, 1], num_samples)    # 0 or 1

 data = {
      'diện_tích': area,
      'số_phòng_ngủ': bedrooms,
      'khoảng_cách_trung_tâm': distance,
      'có_bể_bơi': has_pool,
 }
 df = pd.DataFrame(data)

 # Price is a linear combination of the features plus Gaussian noise
 # (mean 0, standard deviation 200).
 noise = np.random.normal(0, 200, num_samples)
 base_price = (
      1000
      + 20 * df['diện_tích']
      + 500 * df['số_phòng_ngủ']
      - 100 * df['khoảng_cách_trung_tâm']
      + 1000 * df['có_bể_bơi']
 )
 df['giá_nhà'] = base_price + noise

 # Binary label: 1 when the price is below the sample mean, otherwise 0.
 below_mean = df['giá_nhà'] < df['giá_nhà'].mean()
 df['có_mua'] = np.where(below_mean, 1, 0)

 # --- Preview the first 10 rows of the dataset ---
 print("10 dòng dữ liệu đầu tiên:")
 print(df.head(10))

 # --- 2. Prepare features and targets ---
 target_columns = ['giá_nhà', 'có_mua']
 X = df.drop(columns=target_columns)
 y_regression = df['giá_nhà']
 y_classification = df['có_mua']

 # A single call splits the features and both targets together so the rows
 # stay aligned (20% of the rows go to the test set, fixed seed).
 (
      X_train, X_test,
      y_train_reg, y_test_reg,
      y_train_cls, y_test_cls,
 ) = train_test_split(
      X, y_regression, y_classification, test_size=0.2, random_state=42
 )

 # --- 3. Hyper-parameter tuning with GridSearchCV ---
 # Base LightGBM regressor handed to the grid search.
 lgb_model = lgb.LGBMRegressor()

 # Grid of hyper-parameters to search (3 x 3 x 3 = 27 combinations).
 param_grid = {
      'learning_rate': [0.01, 0.1, 1],
      'n_estimators': [50, 100, 200],
      'max_depth': [3, 5, 7],
 }

 # Candidates are ranked by plain R-squared (r2_score); the score is NOT
 # adjusted for the number of features.
 grid_search = GridSearchCV(
      estimator=lgb_model,
      param_grid=param_grid,
      scoring=make_scorer(r2_score),
      cv=5,  # 5-fold cross-validation
      verbose=1,
      n_jobs=-1,  # use every CPU core
 )

 # Run the search on the training data only.
 grid_search.fit(X_train, y_train_reg)

 # Best estimator found by the search.
 best_lgb_reg = grid_search.best_estimator_

 # --- 4. Report the tuning results ---
 print("Kết quả tuning:")
 print("Siêu tham số tốt nhất:", grid_search.best_params_)
 # best_score_ is the mean cross-validated R-squared of the best candidate,
 # so the label must not call it "Adj. R2".
 print("Điểm số tốt nhất (R2 trung bình qua CV):", grid_search.best_score_)

 # --- 5. Track MSE and adjusted R-squared during training ---
 # Adjusted R-squared values appended by evaluate_model, one per call.
 adjusted_r2_scores = []

 def evaluate_model(env):
      """Append the current adjusted R-squared on the test set to adjusted_r2_scores.

      Intended to be passed in ``callbacks``; only ``env.model`` is read from
      the callback environment.
      """
      predictions = env.model.predict(X_test)
      n_rows, n_features = len(y_test_reg), X_test.shape[1]
      unexplained = 1 - r2_score(y_test_reg, predictions)
      adjusted_r2_scores.append(
           1 - unexplained * (n_rows - 1) / (n_rows - n_features - 1)
      )

 # Early stopping must not watch the test set: the stopping point would then
 # be tuned on the very data used for the final metrics, which biases them.
 # Hold out part of the training data as a validation set instead.
 X_fit, X_val, y_fit_reg, y_val_reg = train_test_split(
      X_train, y_train_reg, test_size=0.2, random_state=42
 )

 best_lgb_reg.fit(
      X_fit,
      y_fit_reg,
      eval_set=[(X_val, y_val_reg)],
      callbacks=[
           lgb.early_stopping(stopping_rounds=10, verbose=True),
           lgb.log_evaluation(period=10),
           evaluate_model,  # records the adjusted R-squared for the later plot
      ],
 )

 # Metric history recorded while fitting the tuned regressor.
 evals_result = best_lgb_reg.evals_result_
 epochs = len(evals_result['valid_0']['l2'])

 fig, (ax_mse, ax_r2) = plt.subplots(1, 2, figsize=(12, 5))

 # Left panel: 'l2' (MSE) history of every evaluation set.
 for dataset, metrics in evals_result.items():
      ax_mse.plot(metrics['l2'], label=dataset)
 ax_mse.set_xlabel('Số vòng lặp')
 ax_mse.set_ylabel('MSE (l2)')
 ax_mse.legend()

 # Right panel: adjusted R-squared collected by the callback, plotted
 # against 1-based iteration numbers.
 ax_r2.plot(range(1, epochs + 1), adjusted_r2_scores, label='Adjusted R-squared')
 ax_r2.set_xlabel('Số vòng lặp')
 ax_r2.set_ylabel('Adjusted R-squared')
 ax_r2.legend()

 fig.tight_layout()
 plt.show()

 # --- 6. Learning curve ---
 # R-squared on the training and cross-validation folds for ten training-set
 # fractions between 10% and 100%.
 train_sizes, train_scores, test_scores = learning_curve(
      best_lgb_reg,
      X_train,
      y_train_reg,
      train_sizes=np.linspace(0.1, 1.0, 10),
      cv=5,
      scoring='r2',
 )

 # Average each row of scores over the folds before plotting.
 mean_train_score = train_scores.mean(axis=1)
 mean_cv_score = test_scores.mean(axis=1)

 plt.figure(figsize=(8, 6))
 plt.plot(train_sizes, mean_train_score, label='Huấn luyện')
 plt.plot(train_sizes, mean_cv_score, label='Kiểm tra')
 plt.title('Learning Curve')
 plt.xlabel('Kích thước tập huấn luyện')
 plt.ylabel('R-squared')
 plt.legend()
 plt.show()

 # --- 7. Validation curve ---
 # Reuse the n_estimators values from the tuning grid as the x-axis.
 param_range = param_grid['n_estimators']
 train_scores, test_scores = validation_curve(
      lgb_model,
      X_train,
      y_train_reg,
      param_name='n_estimators',
      param_range=param_range,
      scoring='r2',
      cv=5,
 )

 # Average each row of scores over the folds before plotting.
 mean_train_score = train_scores.mean(axis=1)
 mean_cv_score = test_scores.mean(axis=1)

 plt.figure(figsize=(8, 6))
 plt.plot(param_range, mean_train_score, label='Huấn luyện')
 plt.plot(param_range, mean_cv_score, label='Kiểm tra')
 plt.title('Validation Curve (n_estimators)')
 plt.xlabel('Số lượng cây (n_estimators)')
 plt.ylabel('R-squared')
 plt.legend()
 plt.show()

 # --- 8. Predict on the test set and evaluate ---
 y_pred_reg = best_lgb_reg.predict(X_test)
 mse = mean_squared_error(y_test_reg, y_pred_reg)
 r2 = r2_score(y_test_reg, y_pred_reg)

 # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
 # with n test rows and p feature columns.
 n = len(y_test_reg)
 p = X_test.shape[1]
 adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

 print(f"Mean Squared Error (Regression): {mse:.2f}")
 print(f"R-squared (Regression): {r2:.2f}")
 print(f"Adjusted R-squared (Regression): {adjusted_r2:.2f}")

 # This script never imported accuracy_score nor created a classifier, so the
 # classification steps below used to fail with NameError. Import the metric
 # and train the classifier here, on the training rows only.
 from sklearn.metrics import accuracy_score

 lgb_cls = lgb.LGBMClassifier()
 lgb_cls.fit(X_train, y_train_cls)

 y_pred_cls = lgb_cls.predict(X_test)
 accuracy = accuracy_score(y_test_cls, y_pred_cls)
 print(f"Accuracy (Classification): {accuracy:.2f}")

 # --- 9. Score a new, unseen house ---
 # Single-row frame with the same feature columns the models were trained on.
 new_data = pd.DataFrame({
      'diện_tích': [80],
      'số_phòng_ngủ': [3],
      'khoảng_cách_trung_tâm': [5],
      'có_bể_bơi': [1],
 })

 predicted_value = best_lgb_reg.predict(new_data)
 predicted_class = lgb_cls.predict(new_data)

 print(f"Giá nhà dự đoán (triệu VND): {predicted_value[0]:.2f}")
 print(f"Có mua hay không (1: Có, 0: Không): {predicted_class[0]}")
	import pandas as pd
	import numpy as np
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
	import lightgbm as lgb
	import matplotlib.pyplot as plt

	# --- 1. Build a synthetic house-price dataset ---
	np.random.seed(42)  # fixed seed so every run draws the same sample
	num_samples = 10000

	# Feature draws are made in a fixed order (area, bedrooms, distance, pool)
	# so the random stream is consumed identically on every run.
	area = np.random.randint(30, 150, num_samples)      # integers in [30, 150)
	bedrooms = np.random.randint(1, 5, num_samples)     # integers in [1, 5)
	distance = np.random.randint(1, 20, num_samples)    # integers in [1, 20)
	has_pool = np.random.choice([0, 1], num_samples)    # 0 or 1

	data = {
	    'diện_tích': area,
	    'số_phòng_ngủ': bedrooms,
	    'khoảng_cách_trung_tâm': distance,
	    'có_bể_bơi': has_pool,
	}
	df = pd.DataFrame(data)

	# Price is a linear combination of the features plus Gaussian noise
	# (mean 0, standard deviation 200).
	noise = np.random.normal(0, 200, num_samples)
	base_price = (
	    1000
	    + 20 * df['diện_tích']
	    + 500 * df['số_phòng_ngủ']
	    - 100 * df['khoảng_cách_trung_tâm']
	    + 1000 * df['có_bể_bơi']
	)
	df['giá_nhà'] = base_price + noise

	# Binary label: 1 when the price is below the sample mean, otherwise 0.
	below_mean = df['giá_nhà'] < df['giá_nhà'].mean()
	df['có_mua'] = np.where(below_mean, 1, 0)

	# --- Preview the first 10 rows of the dataset ---
	print("10 dòng dữ liệu đầu tiên:")
	print(df.head(10))

	# --- 2. Prepare features and targets ---
	target_columns = ['giá_nhà', 'có_mua']
	X = df.drop(columns=target_columns)
	y_regression = df['giá_nhà']
	y_classification = df['có_mua']

	# A single call splits the features and both targets together so the rows
	# stay aligned (20% of the rows go to the test set, fixed seed).
	(
	    X_train, X_test,
	    y_train_reg, y_test_reg,
	    y_train_cls, y_test_cls,
	) = train_test_split(
	    X, y_regression, y_classification, test_size=0.2, random_state=42
	)

	# --- 3. Train the models with callbacks ---
	lgb_reg = lgb.LGBMRegressor()
	lgb_cls = lgb.LGBMClassifier()  # classification model

	# Early stopping must not watch the test set: the stopping point would then
	# be tuned on the very data used for the final metrics, which biases them.
	# Hold out part of the training data as a validation set instead.
	X_fit, X_val, y_fit_reg, y_val_reg, y_fit_cls, y_val_cls = train_test_split(
	    X_train, y_train_reg, y_train_cls, test_size=0.2, random_state=42
	)

	# Each fit gets its own callback instances: stop after 10 rounds without
	# improvement on the validation set and log the metric every 10 rounds.
	lgb_reg.fit(
	    X_fit,
	    y_fit_reg,
	    eval_set=[(X_val, y_val_reg)],
	    callbacks=[
	        lgb.early_stopping(stopping_rounds=10, verbose=True),
	        lgb.log_evaluation(period=10),
	    ],
	)

	lgb_cls.fit(
	    X_fit,
	    y_fit_cls,
	    eval_set=[(X_val, y_val_cls)],
	    callbacks=[
	        lgb.early_stopping(stopping_rounds=10, verbose=True),
	        lgb.log_evaluation(period=10),
	    ],
	)

	# --- Plot the metric recorded during training ---
	evals_result = lgb_reg.evals_result_
	# One curve per evaluation set, using the 'l2' (MSE) history. The loop body
	# must be indented under the `for`; the flattened paste had lost that.
	for dataset in evals_result:
	    plt.plot(evals_result[dataset]['l2'], label=dataset)
	plt.xlabel('Số vòng lặp')
	plt.ylabel('MSE (l2)')
	plt.legend()
	plt.show()

	# --- 4. Predict on the test set and evaluate ---
	y_pred_reg = lgb_reg.predict(X_test)
	mse = mean_squared_error(y_test_reg, y_pred_reg)
	r2 = r2_score(y_test_reg, y_pred_reg)

	# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
	# with n test rows and p feature columns.
	n, p = len(y_test_reg), X_test.shape[1]
	unexplained = 1 - r2
	adjusted_r2 = 1 - unexplained * (n - 1) / (n - p - 1)

	print(f"Mean Squared Error (Regression): {mse:.2f}")
	print(f"R-squared (Regression): {r2:.2f}")
	print(f"Adjusted R-squared (Regression): {adjusted_r2:.2f}")

	# Classification quality on the same test rows.
	y_pred_cls = lgb_cls.predict(X_test)
	accuracy = accuracy_score(y_test_cls, y_pred_cls)
	print(f"Accuracy (Classification): {accuracy:.2f}")

	# --- 5. Score a new, unseen house ---
	# Single-row frame with the same feature columns the models were trained on.
	new_house = {
	    'diện_tích': 80,
	    'số_phòng_ngủ': 3,
	    'khoảng_cách_trung_tâm': 5,
	    'có_bể_bơi': 1,
	}
	new_data = pd.DataFrame([new_house])

	predicted_value = lgb_reg.predict(new_data)
	predicted_class = lgb_cls.predict(new_data)

	print(f"Giá nhà dự đoán (triệu VND): {predicted_value[0]:.2f}")
	print(f"Có mua hay không (1: Có, 0: Không): {predicted_class[0]}")

	# Draw tree 0 of the regressor (change tree_index to inspect other trees).
	lgb.plot_tree(lgb_reg, figsize=(30, 15), tree_index=0)
	plt.show()