Skip to content

Instantly share code, notes, and snippets.

@ortsed
Created May 9, 2019 17:47
Show Gist options
  • Select an option

  • Save ortsed/94b8ab307f56500d2c8ec3558dde3b17 to your computer and use it in GitHub Desktop.

Select an option

Save ortsed/94b8ab307f56500d2c8ec3558dde3b17 to your computer and use it in GitHub Desktop.
Python Stats Cheat Sheet
# Cheat Sheet
import warnings
warnings.filterwarnings('ignore')
# Pandas
pd.to_numeric(series, errors="coerce")
pd.to_datetime()
pd.Timestamp()
pd.to_timedelta()
pd.date_range()
pd.cut()
pd.get_dummies(series)
# Observations
df.head()
df.tail()
df.filter()
df.nlargest()
df.nsmallest()
pd.isnull()
pd.notnull()
df.column.isin(values)
df.any()
df.all()
df.shape  # (rows, columns)
df.index
df.columns
df.info()
df.count()
df.describe()
df.apply(lambda row: )
df.applymap()
df.column.map(lambda cell: )
# Cleaning
df.drop_duplicates()
df.duplicated()
series.fillna(value=values)
df.sort_index()
df.reset_index()
df.reindex()
df.drop()
df.rename()
df.sort_values()
df.pivot(columns="")
pd.concat([df, df])
pd.merge(df,df)
pd.melt(df)
df.groupby()
df.groupby().size()
df.agg()
# Group functions
.shift()
.rank()
# Categories
series.astype('category')
series.cat.as_unordered()
series.cat.codes
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit_transform(series)
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
lb.fit_transform(cat_origin)
# Dataframe plots
df.hist()
df.plot()
df.plot.kde()
# Matplotlib
plt.figure()
plt.subplots()
plt.subplot()
# Figure
fig = plt.figure(figsize=(7,5))
fig.suptitle('seniority vs. income', fontsize=16)
ax = fig.gca()
plt.plot()
plt.bar()
plt.hist()
pd.plotting.scatter_matrix()
plt.xlabel()
plt.xticks()
# Seaborn
import seaborn as sns
sns.scatterplot()
sns.regplot()
sns.pairplot(data)
# Sklearn
from sklearn.model_selection import KFold
kf = KFold(n_splits=2)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true, y_pred)
from sklearn.model_selection import cross_val_score, cross_val_predict
neg_mean_squared_error = cross_val_score(linreg, X.loc[test_index], y.loc[test_index], scoring="neg_mean_squared_error")
y_predict = cross_val_predict(linreg, X_train, y_train,cv=5)
mean_squared_error(y_train,y_predict)
rmse = np.sqrt(mean_squared_error(y_train,y_predict))
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(predictors, y)
linreg.coef_
linreg.intercept_
r_squared = linreg.score(X_train, y_train)
r_adj = 1 - (1-r_squared)*(len(y)-1)/(len(y)-X.shape[1]-1)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
rfe = RFE(linreg, n_features_to_select = 2).fit(predictors, series)
rfe.ranking_
rfe.support_
rfe.estimator_.coef_
rfe.estimator_.intercept_
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_train, y_hat_train)
# Statistics
# OLS
from statsmodels.formula.api import ols
model = ols(formula="weight~height", data=df).fit()
model.summary()
model.pvalues
model.rsquared
model.params #arrays (Intercept, slope)
import statsmodels.api as sm
import statsmodels.stats.api as sms
sm.add_constant(predictors)
# 4 diagnostic charts for one regressor (fitted vs. x, residuals vs. x, partial regression, CCPR)
fig = sm.graphics.plot_regress_exog(model, "height", fig=fig)
## QQ test for normality of residuals
import scipy.stats as stats
fig = sm.graphics.qqplot(model.resid, dist=stats.norm, line='45', fit=True)
# Jarque Bera test for normality
(jarque_bera, prob, skew, kurtosis) = sms.jarque_bera(model.resid)
#Goldfeld-Quandt test for homoscedasticity
f_statistic, p_value, ordering = sms.het_goldfeldquandt(model.resid.iloc[indices], model.model.exog[indices])
#split and calculate
lwr_thresh = data.TV.quantile(q=.45)
upr_thresh = data.TV.quantile(q=.55)
middle_10percent_indices = data[(data.TV >= lwr_thresh) & (data.TV<=upr_thresh)].index
indices = [x-1 for x in data.index if x not in middle_10percent_indices]
pl
Copy link
Copy Markdown

ghost commented Jul 8, 2021

How to drop insignificant variables in statsmodel python (linear regression)

@ortsed
Copy link
Copy Markdown
Author

ortsed commented Jul 8, 2021

I don't think there is a fixed method in statsmodels for that; it requires analysis and manual filtering.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment