Skip to content

Instantly share code, notes, and snippets.

@Manikant92
Manikant92 / smallberta_pretraining.ipynb
Created February 27, 2020 14:30 — forked from aditya-malte/smallberta_pretraining.ipynb
smallBERTa_Pretraining.ipynb
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# Same pipeline as before: lists -> dict -> DataFrame.
new_dictionary = {'X': new_x, 'y': new_y}
new_df = pd.DataFrame(new_dictionary)
# Hold out 20% of the 10 rows as a test set; fixed seed keeps the split reproducible.
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(
    new_df[['X']], new_df.y, test_size=0.2, random_state=5
)
# Refit the linear model on the training portion only.
lr.fit(new_X_train, new_y_train)
# Slope of the freshly fitted line.
print(lr.coef_[0])
# scikit-learn operates on top of NumPy, so convert the plain lists first.
new_x = np.array(new_x)
new_y = np.array(new_y)
# A rank-1 array is not a valid feature matrix; reshape into a single column.
new_x = new_x.reshape(-1, 1)
# Fit on the full data set to recover slope and intercept.
lr.fit(new_x, new_y)
print('Slope with Linear Regression SciKit Learn: ', lr.coef_[0])
print('Intercept with Linear Regression SciKit Learn: ', lr.intercept_)
# Fresh input values (as laid out in the spreadsheet).
new_x = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
# Corresponding output values from the spreadsheet.
new_y = [10, 12, 20, 22, 21, 25, 30, 21, 32, 34]
# Visualise the raw points before fitting any line.
plt.scatter(new_x, new_y)
plt.xlabel('New Independent Variable')
plt.ylabel('New Dependent Variable')
plt.show()
# The scatter plot above shows input vs. output before the best fit is drawn.
# Fitted slope — the model's "logic" connecting inputs to outputs.
m = lr.coef_[0]
# Fitted intercept — the constant term.
c = lr.intercept_
# Evaluate y = m*x + c at every input so we can draw the candidate best-fit line.
regression_line = [m * xi + c for xi in x]
# Overlay the observed points (blue) with the fitted line (red).
plt.scatter(x, y, color='blue')
plt.plot(x, regression_line, color='red')
plt.ylabel('Dependent/Output Variable')
# The learned coefficient is the "logic" that maps inputs to outputs.
print(lr.coef_)
#output: [ 11.]
# The intercept is the constant offset between input and output variables.
print(lr.intercept_)
#output: -2.84217094304e-14 which is -0.0000000000000284217094304 in decimal notation
# BUG FIX: scikit-learn's predict() requires a 2-D array of shape
# (n_samples, n_features); passing a bare scalar raises a ValueError.
# Wrap the single sample as [[value]].
print(lr.predict([[-18]]))
#output: [-198.]
#-18 * 11 = -198
# Predict with 89 as the input, wrapped the same way.
print(lr.predict([[89]]))
# Pull in scikit-learn's metrics module to score the fit.
from sklearn import metrics
# R-squared for the held-out predictions, expressed as a percentage.
print(metrics.r2_score(y_test, y_prediction) * 100)
#output: 100.0
@Manikant92
Manikant92 / test.py
Last active September 4, 2018 09:42
# X_test holds the held-out rows; run them through the trained model's predict().
y_prediction = lr.predict(X_test)
# Display what the model predicts for each held-out input.
print(y_prediction)
#output: [ 154. 33. 110. 55. 22.]