Last active
June 1, 2023 22:49
-
-
Save zograf/7aefd45d4f594a76d61c9607fc7dee7b to your computer and use it in GitHub Desktop.
ORI kolokvijum 2 kod
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import re | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.metrics import f1_score, accuracy_score | |
from sklearn.neural_network import MLPClassifier, MLPRegressor | |
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler | |
from sklearn.model_selection import train_test_split | |
from sklearn.svm import SVC | |
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, AdaBoostClassifier | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
from sklearn.datasets import load_iris | |
from sklearn.cluster import KMeans, DBSCAN | |
import matplotlib.pyplot as plt | |
#### UTILITY #### | |
def calculate_rmse(predicted, true):
    """Return the root-mean-square error between predictions and targets.

    Both arguments are converted to float NumPy arrays first, so plain
    Python lists work, boolean targets are treated as 0/1, and two pandas
    Series are compared element by element (by position) instead of being
    aligned on their index labels.

    Args:
        predicted: Array-like of predicted values.
        true: Array-like of reference values, same shape as ``predicted``.

    Returns:
        The RMSE as a NumPy float.
    """
    predicted = np.asarray(predicted, dtype=float)
    true = np.asarray(true, dtype=float)
    return np.sqrt(((predicted - true) ** 2).mean())
def remove_outliers(df_in, col_name, scale=1.5):
    """Return the rows of ``df_in`` whose ``col_name`` value is not an outlier.

    Outliers are detected with Tukey's fences: a value is kept when it lies
    inside ``[q1 - scale * iqr, q3 + scale * iqr]``, where ``q1``/``q3`` are
    the 25th/75th percentiles of the column and ``iqr = q3 - q1``.

    The fences are inclusive. With strict comparisons a column whose IQR is
    0 (e.g. a constant column) would lose every row, because no value is
    strictly greater than ``q1`` and strictly smaller than ``q3`` at once.

    Args:
        df_in: Input DataFrame; it is not modified.
        col_name: Name of the numeric column to test.
        scale: Multiplier of the IQR that sets the fence distance.

    Returns:
        A DataFrame with the rows inside the fences. Rows whose value is
        missing (NaN) are dropped as well.
    """
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3 - q1
    fence_low = q1 - scale * iqr
    fence_high = q3 + scale * iqr
    # Series.between is inclusive on both ends by default.
    inside_fences = df_in[col_name].between(fence_low, fence_high)
    return df_in.loc[inside_fences]
#### MAIN CODE ####
# Template walkthrough: 'naziv_kolone' ("column name") is a placeholder and
# 'zvanje' is the target column; replace them with the real column names.

# Load the data and take a first look at it.
df = pd.read_csv("./data/train.csv")
print(df)

# Check whether the values of a column are balanced.
df["naziv_kolone"].value_counts().plot.bar()
plt.show()

# Drop every row that has a missing value in any column.
df = df.dropna()

# LabelEncoder assigns an integer to every category of a column.
# This is sometimes a bad choice, because some categories end up with a
# larger number than others (an ordering the data does not really have).
lenc = LabelEncoder()
df['naziv_kolone'] = lenc.fit_transform(df['naziv_kolone'])

# One-hot encoding: 'pol' becomes e.g. pol_Male / pol_Female and 'oblast'
# becomes oblast_A / oblast_B, each holding True/False values.
# drop_first=True already removes the first dummy of every encoded column
# (e.g. pol_Female), so no manual drop follows here. Dropping 'pol_Male' as
# well would delete the 'pol' feature completely. The manual alternative is
# to call get_dummies WITHOUT drop_first and then drop one dummy per column
# with df.drop(..., axis=1).
df = pd.get_dummies(df, columns=['oblast', 'pol'], drop_first=True)

# Split into a training and a test set.
train, test = train_test_split(df, test_size=0.3, random_state=42)

# Use a boxplot to check for outliers.
plt.boxplot(train['naziv_kolone'])
plt.show()

# Remove outliers (training set only).
train = remove_outliers(train, 'naziv_kolone')

# x_train holds the columns we learn from (everything except the target),
# y_train is the column we predict.
x_train = train.drop("zvanje", axis=1)
y_train = train['zvanje']

# Standardisation: z = (x - mean) / sd.
# Every column ends up with mean 0 and standard deviation 1; the values are
# NOT limited to the range 0..1 (that would be MinMaxScaler).
st = StandardScaler()
# Fit on the training data only, then transform it. A new DataFrame is built
# instead of assigning floats back into the original int/bool columns.
x_train = pd.DataFrame(st.fit_transform(x_train), columns=x_train.columns, index=x_train.index)

# Split the test set the same way as the training set.
x_test = test.drop("zvanje", axis=1)
y_test = test['zvanje']
# ONLY transform the test data (no fit), so nothing leaks from the test set.
x_test = pd.DataFrame(st.transform(x_test), columns=x_test.columns, index=x_test.index)

# Rules of thumb for the network:
# - use 1-5 hidden layers
# - layer sizes should be powers of two and decreasing (e.g. [64, 32])
# - the total number of neurons should be around ~128
# - the learning rate should be between 10^-1 and 10^-6
# - 0.01 is the usual choice for small networks, 0.001 for deeper ones

# CLASSIFICATION (categorical target)
clf = MLPClassifier(hidden_layer_sizes=[50, 50, 20], learning_rate_init=0.01, max_iter=1000, verbose=True, random_state=42).fit(x_train, y_train)
y_pred_clf = clf.predict(x_test)

# REGRESSION (numeric target)
# NOTE(review): both models are fitted on the same placeholder target here;
# in a real task keep only the one that matches the target's type.
mlp = MLPRegressor(hidden_layer_sizes=[50, 50, 20], learning_rate_init=0.01, max_iter=20000, verbose=True, random_state=42).fit(x_train, y_train)
y_pred = mlp.predict(x_test)

# Plot the loss curves to see how the hyper-parameters affect training.
plt.plot(np.arange(len(clf.loss_curve_)), clf.loss_curve_, label='classifier')
plt.plot(np.arange(len(mlp.loss_curve_)), mlp.loss_curve_, label='regressor')
plt.legend()
plt.show()

# F1 metric (classification) - computed on the classifier's predictions;
# scikit-learn metrics take (y_true, y_pred) in that order.
F1 = f1_score(y_test, y_pred_clf, average='micro')
print(f'F1: {F1}')

# RMSE metric (regression) - computed on the regressor's predictions.
RMSE = calculate_rmse(y_pred, y_test)
print(f'RMSE: {RMSE}')
#### NAIVNI BAJES ####
# Naive Bayes text classification: predict 'Character' from the text in 'Line'.

# Encode the target labels as integers.
df['Character'] = LabelEncoder().fit_transform(df['Character'])

# Text clean-up: lowercase, strip punctuation, drop one-character tokens.
df['Line'] = df['Line'].apply(lambda x: x.lower())
df['Line'] = df['Line'].apply(lambda x: re.sub(r"[^\w\s]", "", x))
df['Line'] = df['Line'].apply(lambda x: " ".join(word for word in x.split() if len(word) >= 2))

# train_test_split can also return x and y directly when we pass the
# feature column and the target column separately.
x_train_text, x_test_text, y_train, y_test = train_test_split(df['Line'], df['Character'], test_size=0.2, random_state=496)

# Vectorise the text. Stop words are removed; ngram_range controls how many
# consecutive words form one feature (single words, pairs, triples, ...).
# CountVectorizer and TfidfVectorizer are ALTERNATIVES: exactly one of them
# is fitted on the raw text. Running them one after another would feed the
# already-vectorised sparse matrix into the second one, which expects strings.
USE_TFIDF = True
if USE_TFIDF:
    vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
else:
    vect = CountVectorizer(stop_words="english", ngram_range=(1, 2))
# Fit the vocabulary on the training text only, then reuse it for the test text.
x_train = vect.fit_transform(x_train_text)
x_test = vect.transform(x_test_text)

# Train and apply the Naive Bayes classifier.
nb = MultinomialNB().fit(x_train, y_train)
y_pred = nb.predict(x_test)

# Metric - scikit-learn metrics take (y_true, y_pred) in that order.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
#### KLASTEROVANJE ####
# Clustering of the bank data set with KMeans and DBSCAN.
df = pd.read_csv('bank.csv')
print(df)

# Encode every categorical column with its own LabelEncoder and keep the
# fitted encoders, so the integer codes can be mapped back to the original
# categories later (encoders[col].inverse_transform).
categorical_cols = ['sex', 'region', 'married', 'car', 'save_act', 'current_act', 'mortgage', 'pep']
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
print(df)

# 'id' is deliberately NOT a feature: it only identifies the row, and its
# distinct values would keep every pair of points far apart.
feature_cols = ['age', 'sex', 'region', 'income', 'married', 'children', 'car', 'save_act', 'current_act', 'mortgage', 'pep']
X = df[feature_cols]

# KMeans and DBSCAN work on distances, so all features are brought to the
# same 0..1 range first; otherwise 'income' alone would decide the clusters.
X_scaled = MinMaxScaler().fit_transform(X)

# n_init is set explicitly because its default differs between
# scikit-learn versions.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
kmeans.fit(X_scaled)
kmeans_labels = kmeans.labels_

# NOTE(review): eps is measured in the scaled feature space; 0.02 is kept
# from the original and most likely needs tuning for this data.
dbscan = DBSCAN(eps=0.02, min_samples=5)
dbscan.fit(X_scaled)
labels = dbscan.labels_

# Plot in the original units, coloured by the DBSCAN labels (-1 = noise).
# Pass c=kmeans_labels instead to look at the KMeans result.
plt.scatter(X['age'], X['income'], c=labels)
plt.show()
### RESENJE ZADATAKA SA PRIPREME ###
# (Solutions of the preparation tasks.)
# PRVI #
# Task 1: estimate churn from the total international minutes with an MLP.
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('customer_churn.csv')
X = df[['total intl minutes']]
y = df['churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MLPs are sensitive to feature scale: fit the scaler on the training data
# only and reuse it for every later input.
scaler = StandardScaler().fit(X_train)

mlp = MLPRegressor(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', random_state=42)
mlp.fit(scaler.transform(X_train), y_train)

# Predict the churn estimate for new values.
# NOTE: a regressor's output is not bounded to 0..1, so it is an estimate of
# the churn likelihood rather than a true probability.
new_X = [[5], [60]]  # New input values (minutes)
# Give the new values the training column name so the scaler sees the same
# feature names it was fitted with.
new_X_frame = pd.DataFrame(new_X, columns=X.columns)
predictions = mlp.predict(scaler.transform(new_X_frame))
for x, prediction in zip(new_X, predictions):
    print(f"{x[0]} minuta: {prediction}")
# DRUGI #
# Task 2: cluster the customers and report the churn rate of each cluster.
# Load the data from the CSV file.
df = pd.read_csv('customer_churn.csv')
X = df[['total intl minutes', 'total day minutes']]

# Create a KMeans model with 2 clusters. n_init is set explicitly because
# its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
# Cluster the data.
kmeans.fit(X)
# Store the cluster of every row in the DataFrame.
df['cluster'] = kmeans.labels_

# Number of churned customers per cluster.
churn_counts = df.groupby('cluster')['churn'].sum()
# Total number of customers per cluster.
total_counts = df['cluster'].value_counts()
# Churn percentage per cluster (the division aligns on the cluster label).
churn_percentages = churn_counts / total_counts * 100

# Print the result; the cluster label is part of the message so the lines
# can be told apart.
for cluster, percentage in churn_percentages.items():
    print(f"Klaster {cluster} - procenat napuštanja kompanije: {percentage}%")

# Plot the clusters.
plt.scatter(X['total intl minutes'], X['total day minutes'], c=df['cluster'])
plt.show()
# TRECI #
# Task 3: classify churn with an MLP and report the test accuracy.
data = pd.read_csv('customer_churn.csv')

# Encode the two categorical plan columns as integers. The values are read
# from `data` itself, not from the `df` left over from the previous task.
lenc = LabelEncoder()
data['international plan'] = lenc.fit_transform(data['international plan'])
data['voice mail plan'] = lenc.fit_transform(data['voice mail plan'])

# Select the relevant features and target variable
X = data[['international plan', 'voice mail plan', 'number vmail messages', 'total intl calls', 'total night calls', 'total day calls']]
y = data['churn']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# MLPs are sensitive to feature scale: fit the scaler on the training data
# only, then apply the same transformation to the test data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Create an instance of the MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, predictions)

# Print the accuracy
print("Accuracy on the test set:", accuracy)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment