Skip to content

Instantly share code, notes, and snippets.

View MariaLavrovskaya's full-sized avatar

Maria MariaLavrovskaya

  • London, United Kingdom
View GitHub Profile
# Compute the pairwise correlation matrix of `data` and display it through the
# pandas Styler with a background gradient using the 'PuOr' colormap.
# NOTE(review): `data` is presumably a pandas DataFrame loaded elsewhere — confirm.
data.corr().style.background_gradient(cmap='PuOr')
@MariaLavrovskaya
MariaLavrovskaya / airbnb_1.py
Last active October 13, 2019 11:42
airbnb_1
import pandas as pd
from scipy import median
from scipy.stats import mode
# Load the dataset from the local CSV file
data = pd.read_csv('AB_NYC_2019.csv')
data.head()  # preview of the first rows (only rendered in an interactive session)
data.describe().T  # summary statistics, transposed
# Select the 'number_of_reviews' column (kept as a one-column DataFrame)
# as preparation for calculating its median
reviews = data.loc[:, ['number_of_reviews']]
# dropna() returns a new object rather than modifying `reviews`, so the result
# has to be assigned back; a bare call would discard it.
reviews = reviews.dropna()
# Put the actual and the predicted target values side by side for inspection
df = pd.DataFrame({'Actual': Y_test.values.flatten(), 'Predicted': y_pred.flatten()})
df.head(25)
# sklearn.metrics has no `median_squared_error` (importing it raises
# ImportError); the median-based regression metric it provides is
# `median_absolute_error`, so that one is used and labelled accordingly.
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
print("Mean squared error: %.2f"
      % mean_squared_error(Y_test, y_pred))
print("Median absolute error: %.2f"
      % median_absolute_error(Y_test, y_pred))
# Table of true vs. predicted target values, one row per test sample
df = pd.DataFrame(
    {
        'Actual': Y_test.values.flatten(),
        'Predicted': y_pred.flatten(),
    }
)
df.head(25)
from sklearn.metrics import mean_squared_error, r2_score
# Report the mean squared error of the predictions with two decimals
print("Mean squared error: %.2f" % mean_squared_error(Y_test, y_pred))
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; the fixed random_state makes the
# split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X_2, y, test_size=0.25, random_state=0
)

# Linear regression without an intercept term.
# Do not use fit_intercept=False if you have removed 1 column after dummy encoding.
# fit() returns the estimator itself, so creation and training are chained.
regr = linear_model.LinearRegression(fit_intercept=False).fit(X_train, Y_train)

# Predictions for the held-out samples
y_pred = regr.predict(X_test)
# Resulting feature matrix with all of the independent variables:
# `scaled_columns` and `X_train_ohe` are joined along axis 1 (column-wise).
# NOTE(review): assumes both inputs are 2-D arrays with the same number of rows — confirm.
X_2 = np.concatenate((scaled_columns,X_train_ohe),axis=1)
# Treating continuous variables with StandardScaler:
# pull the 'runtime' column out of df_1 as a NumPy array
columns_to_scale = np.array(df_1['runtime'])
# Create the scaler
scaler = StandardScaler()
# Indexing with None adds a trailing axis (None is what np.newaxis stands for),
# then the scaler is fitted and applied in one step
scaled_columns = scaler.fit_transform(columns_to_scale[:, None])
# From labels to dummy (one-hot) columns
from sklearn.preprocessing import OneHotEncoder
# sparse=False requests a dense array instead of a sparse matrix.
# NOTE(review): newer scikit-learn releases renamed this argument to
# `sparse_output` — confirm against the installed version before upgrading.
ohe = OneHotEncoder(sparse=False)
# Fit the encoder on X_train_le and transform it in a single step.
# NOTE(review): X_train_le is presumably the label-encoded categorical data — confirm.
X_train_ohe = ohe.fit_transform(X_train_le)
# Treating categorical variables: create a LabelEncoder instance.
# NOTE(review): presumably the label-encoding step that precedes one-hot encoding — confirm.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# LabelEncoder for a number of columns
class MultiColumnLabelEncoder:
    """Label encoder configured with a selection of columns.

    Only the constructor is defined here; it records which columns
    should be encoded.
    """

    def __init__(self, columns=None):
        """Store the columns to encode.

        Args:
            columns: List of the columns to encode. Defaults to ``None``;
                this class body does not define how ``None`` is interpreted.
        """
        self.columns = columns  # list of columns to encode
# Remove every row of df_1 that holds at least one missing value (in place)
df_1.dropna(axis=0, how='any', inplace=True)
# Check: number of missing cells still present after the dropna()
print(df_1.isna().values.sum())