Created
May 9, 2019 17:47
-
-
Save ortsed/94b8ab307f56500d2c8ec3558dde3b17 to your computer and use it in GitHub Desktop.
Python Stats Cheat Sheet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Cheat Sheet | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| # Pandas | |
| pd.to_numeric(series, errors="coerce")  # invalid parsing becomes NaN | |
| pd.to_datetime() | |
| series.to_timestamp()  # method on Series/DataFrame (PeriodIndex), not pd.to_timestamp | |
| pd.to_timedelta() | |
| pd.date_range() | |
| pd.cut() | |
| pd.get_dummies(series) | |
| Observations | |
| df.head() | |
| df.tail() | |
| df.filter() | |
| df.nlargest() | |
| df.nsmallest() | |
| pd.isnull() | |
| pd.notnull() | |
| df.column.isin(values) | |
| df.any() | |
| df.all() | |
| df.shape (rows,columns) | |
| df.index | |
| df.columns | |
| df.info() | |
| df.count() | |
| df.describe() | |
| df.apply(lambda row: ) | |
| df.applymap() | |
| df.column.map(lambda cell: ) | |
| # Cleaning | |
| df.drop_duplicates() | |
| df.duplicated() | |
| series.fillna(value=values) | |
| df.sort_index() | |
| df.reset_index() | |
| df.reindex() | |
| df.drop() | |
| df.rename() | |
| df.sort_values() | |
| df.pivot(columns="") | |
| pd.concat([df, df]) | |
| pd.merge(df,df) | |
| pd.melt(df) | |
| df.groupby() | |
| df.size() | |
| df.agg() | |
| # Group functions | |
| .shift() | |
| .rank() | |
| Categories | |
| series.astype('category') | |
| series.cat.as_unordered() | |
| series.cat.codes | |
| encoder = LabelEncoder() | |
| encoder.fit_transform(series) | |
| from sklearn.preprocessing import LabelBinarizer | |
| lb = LabelBinarizer() | |
| lb.fit_transform(cat_origin) | |
| # Dataframe plots | |
| df.hist() | |
| df.plot() | |
| df.plot.kde() | |
| Matplotlib | |
| plt.figure() | |
| plt.subplots() | |
| plt.subplot() | |
| Figure | |
| fig = plt.figure(figsize=(7,5)) | |
| fig.suptitle('seniority vs. income', fontsize=16) | |
| ax = fig.gca() | |
| plt.plot() | |
| plt.bar() | |
| plt.hist() | |
| pd.plotting.scatter_matrix() | |
| plt.xlabel() | |
| plt.xticks() | |
| # Seaborn | |
| import seaborn as sns | |
| sns.scatterplot() | |
| sns.regplot() | |
| sns.pairplot(data) | |
| # Sklearn | |
| from sklearn.model_selection import KFold | |
| kf = KFold(n_splits=2) | |
| from sklearn.metrics import mean_squared_error | |
| mean_squared_error(y_true, y_pred) | |
| from sklearn.model_selection import cross_val_score, cross_val_predict | |
| neg_mean_squared_error = cross_val_score(linreg, X.loc[test_index], y.loc[test_index], scoring="neg_mean_squared_error")  # default scoring is R^2, so pass scoring explicitly | |
| y_predict = cross_val_predict(linreg, X_train, y_train,cv=5) | |
| mean_squared_error(y_train,y_predict) | |
| rmse = np.sqrt(mean_squared_error(y_train, y_predict)) | |
| from sklearn.linear_model import LinearRegression | |
| linreg = LinearRegression() | |
| linreg.fit(predictors, y) | |
| linreg.coef_ | |
| linreg.intercept_ | |
| r_squared = linreg.score(X_train, y_train) | |
| r_adj = 1 - (1-r_squared)*(len(y)-1)/(len(y)-X.shape[1]-1) | |
| from sklearn.feature_selection import RFE | |
| from sklearn.linear_model import LinearRegression | |
| linreg = LinearRegression() | |
| rfe = RFE(linreg, n_features_to_select = 2).fit(predictors, series) | |
| rfe.ranking_ | |
| rfe.support_ | |
| rfe.estimator_.coef_ | |
| rfe.estimator_.intercept_ | |
| from sklearn.model_selection import train_test_split | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) | |
| from sklearn.metrics import mean_squared_error | |
| mean_squared_error(y_train, y_hat_train) | |
| # Statistics | |
| # OLS | |
| from statsmodels.formula.api import ols | |
| model = ols(formula="weight~height", data=df).fit() | |
| model.summary() | |
| model.pvalues | |
| model.rsquared | |
| model.params #arrays (Intercept, slope) | |
| import statsmodels.api as sm | |
| import statsmodels.stats.api as sms  # provides jarque_bera and het_goldfeldquandt used below | |
| sm.add_constant(predictors) | |
| # 4 charts of error and | |
| fig = sm.graphics.plot_regress_exog(model, "height", fig=fig) | |
| ## QQ test for normality of residuals | |
| import scipy.stats as stats | |
| fig = sm.graphics.qqplot(model.resid, dist=stats.norm, line='45', fit=True) | |
| # Jarque Bera test for normality | |
| (jarque_bera, prob, skew, kurtosis) = sms.jarque_bera(model.resid) | |
| #Goldfeld-Quandt test for homoscedasticity | |
| f_statistic, p_value = sms.het_goldfeldquandt(model.resid.iloc[indices], model.model.exog[indices]) | |
| #split and calculate | |
| lwr_thresh = data.TV.quantile(q=.45) | |
| upr_thresh = data.TV.quantile(q=.55) | |
| middle_10percent_indices = data[(data.TV >= lwr_thresh) & (data.TV <= upr_thresh)].index | |
| indices = [x-1 for x in data.index if x not in middle_10percent_indices] | |
| pl | |
Author
I don't think there is a fixed method in statsmodels for that, but requires analysis and manual filtering.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How to drop insignificant variables in statsmodel python (linear regression)