# Source: GitHub gist d4rkc0de/0601195571e51b787e1fa5f48dd35637
# Author: @d4rkc0de -- last active September 13, 2017 16:58
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
# Train a linear regression on train.csv and write rounded predictions for
# test.csv to pred.csv (one "id,label" line per test row).
#
# Preprocessing uses pandas/numpy only.  Every statistic (fill values,
# category levels) is learned from the training data and reused for the test
# data, so both feature matrices are encoded identically.

# Positional column layout of the two CSV files.
TRAIN_FEATURE_COLUMNS = [2, 4, 5, 6, 7, 9, 11]
TRAIN_TARGET_COLUMN = 1
TEST_ID_COLUMN = 0
TEST_FEATURE_COLUMNS = [1, 3, 4, 5, 6, 8, 10]

# Positions *within the selected feature columns* that hold categorical
# values; every other position is treated as numeric.
# NOTE(review): position 6 is presumably the 'Embarked' column -- confirm
# against the CSV header.
CATEGORICAL_POSITIONS = (1, 6)


def fill_embarked(frame, fill_value):
    """Return a copy of *frame* with missing 'Embarked' values replaced."""
    filled = frame.copy()
    filled['Embarked'] = filled['Embarked'].fillna(fill_value)
    return filled


def fit_preprocessor(train_frame, feature_columns,
                     categorical_positions=CATEGORICAL_POSITIONS):
    """Learn the preprocessing parameters from the training frame.

    Args:
        train_frame: training data as a DataFrame with an 'Embarked' column.
        feature_columns: positional indices of the feature columns.
        categorical_positions: positions inside ``feature_columns`` that are
            categorical.

    Returns:
        A dict with the most frequent 'Embarked' value ('embarked_fill'),
        the sorted category levels per categorical position ('levels') and
        the mean per numeric position ('numeric_means').
    """
    embarked_fill = train_frame['Embarked'].value_counts().idxmax()
    selected = fill_embarked(train_frame, embarked_fill)
    selected = selected.iloc[:, list(feature_columns)]

    levels = {}
    numeric_means = {}
    for position in range(selected.shape[1]):
        column = selected.iloc[:, position]
        if position in categorical_positions:
            levels[position] = sorted(set(column.dropna()))
        else:
            # Series.mean() skips NaN, so this is the mean of known values.
            numeric_means[position] = column.astype(float).mean()

    return {
        'embarked_fill': embarked_fill,
        'levels': levels,
        'numeric_means': numeric_means,
    }


def build_features(frame, feature_columns, params):
    """Turn *frame* into a float feature matrix using fitted *params*.

    Numeric positions have NaN replaced by the training mean.  Categorical
    positions are one-hot encoded against the training levels, with the
    first level dropped to avoid the dummy variable trap.

    Args:
        frame: DataFrame to encode (training or test data).
        feature_columns: positional indices of the feature columns in *frame*.
        params: dict returned by ``fit_preprocessor``.

    Returns:
        A 2-D float numpy array with one row per row of *frame*.

    Raises:
        ValueError: if a categorical position holds a value (including NaN)
            that was not present in the training data.
    """
    selected = fill_embarked(frame, params['embarked_fill'])
    selected = selected.iloc[:, list(feature_columns)]

    pieces = []
    for position in range(selected.shape[1]):
        column = selected.iloc[:, position]
        if position in params['levels']:
            known_levels = params['levels'][position]
            if not column.isin(known_levels).all():
                raise ValueError(
                    'feature position {} contains values not seen in '
                    'training'.format(position))
            for level in known_levels[1:]:
                pieces.append((column == level).astype(float).values)
        else:
            mean = params['numeric_means'][position]
            pieces.append(column.astype(float).fillna(mean).values)
    return np.column_stack(pieces)


def to_labels(raw_predictions):
    """Round each regression output to the nearest integer label.

    Values are only rounded, not clipped, so outputs far from the target
    range produce labels outside it.
    """
    return [int(round(float(value))) for value in raw_predictions]


def write_predictions(path, ids, labels):
    """Write one ``id,label`` line per prediction to *path* (no header row)."""
    # Text mode with newline='' writes '\n' untouched on every platform.
    with open(path, 'w', newline='') as out:
        out.writelines(
            '{},{}\n'.format(row_id, label)
            for row_id, label in zip(ids, labels)
        )


def main():
    """Fit on train.csv, predict for test.csv and write pred.csv."""
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')

    params = fit_preprocessor(train, TRAIN_FEATURE_COLUMNS)
    X = build_features(train, TRAIN_FEATURE_COLUMNS, params)
    y = train.iloc[:, TRAIN_TARGET_COLUMN].values

    # predicting with linear regression
    regressor = LinearRegression()
    regressor.fit(X, y)

    X_test = build_features(test, TEST_FEATURE_COLUMNS, params)
    labels = to_labels(regressor.predict(X_test))
    write_predictions('pred.csv', test.iloc[:, TEST_ID_COLUMN].values, labels)


if __name__ == '__main__':
    main()
# (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")