Created
May 19, 2019 19:09
-
-
Save wesslen/c3ec04fe6bd61e6bc7c8b446aab903a1 to your computer and use it in GitHub Desktop.
cfpb complaints - reticulate
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- | |
| name: "Ryan Wesslen" | |
| title: "cfpb complaints/reticulate" | |
| output: html_document | |
| --- | |
| ```{r setup, include=FALSE} | |
| # Show chunk source in the rendered document by default. | |
| knitr::opts_chunk$set(echo = TRUE) | |
| # reticulate runs the Python chunks and exposes their objects through `py`. | |
| library(reticulate) | |
| library(tidyverse) | |
| ``` | |
| ## R Markdown | |
| Let's use pandas to load the original data. | |
| ```{python data} | |
| import pandas as pd | |
| data = pd.read_csv('complaints.csv', engine='python') | |
| data.head(n=5) | |
| ``` | |
| Let's keep only three of the products. | |
| ```{python product-clean} | |
| products = ['Credit card','Checking or savings account','Mortgage'] | |
| df = data.loc[data["Product"].isin(products)] | |
| ``` | |
| Let's now remove rare sub-products. | |
| ```{python sub-product} | |
| subproducts = ['CD (Certificate of Deposit)','Personal line of credit','Reverse mortgage','VA mortgage'] | |
| df = df.loc[~data["Sub-product"].isin(subproducts)] | |
| h = df.groupby(['Sub-product'])['Sub-product'].count() | |
| print(h) | |
| ``` | |
| ```{python sub-product2} | |
| df.loc[df['Sub-product'] == "Home equity loan or line of credit (HELOC)", 'Sub-product'] = "Home equity loan or line of credit" | |
| df.loc[df['Sub-product'] == "Other type of mortgage", 'Sub-product'] = "Other mortgage" | |
| # give | |
| df.loc[df['Sub-product'] == "None", 'Sub-product'] = "Other banking product or service" | |
| df.loc[df['Product'] == "Credit card", 'Sub-product'] = "Credit card" | |
| ``` | |
| # Product-Issue Hierarchy | |
| ```{r} | |
| library(collapsibleTree) | |
| library(RColorBrewer) | |
| # Levels of the drill-down, outermost first. | |
| tree_levels <- c("Product", "Sub-product", "Issue", "Sub-issue") | |
| # One row per product / sub-product / issue / sub-issue combination; count() | |
| # stores the number of complaints in column `n`. | |
| tree_counts <- count(py$df, Product, `Sub-product`, Issue, `Sub-issue`) | |
| # Interactive collapsible tree rooted at "Total", carrying `n` as the node attribute. | |
| collapsibleTree( | |
|   tree_counts, | |
|   hierarchy = tree_levels, | |
|   root = "Total", | |
|   attribute = "n", | |
|   nodeSize = "leafCount", | |
|   tooltip = TRUE, | |
|   width = 800, | |
|   zoomable = FALSE | |
| ) | |
| ``` | |
| ```{python} | |
| h = df.groupby(['Issue'])['Issue'].count() | |
| print(h) | |
| ``` | |
| ```{r} | |
| # Write the cleaned complaints data frame to a CSV file. | |
| # The original call passed `t` (base R's transpose function) where write_csv() | |
| # expects the destination file, so it could not write anything. | |
| # NOTE(review): the file name below was chosen during review - change it if a | |
| # different destination was intended. | |
| cleaned_path <- "complaints_clean.csv" | |
| write_csv(py$df, cleaned_path) | |
| ``` | |
| ```{r eval=FALSE} | |
| # Horizontal bar chart of complaint counts per product. | |
| # count() stores its tally in column `n`, so `n` (not `count`, which is the | |
| # dplyr function) must drive both the bar ordering and the bar length. | |
| # No arrange() step is needed: fct_reorder() alone determines the bar order. | |
| py$df %>% | |
|   count(Product) %>% | |
|   ggplot(aes(x = forcats::fct_reorder(Product, n), y = n)) + | |
|   geom_col() + | |
|   coord_flip() + | |
|   ggthemes::theme_economist() + | |
|   labs(x = " ", y = "Number of Complaints", title = "CFPB Complaints by product") | |
| ``` | |
| ## Product Only | |
| ```{python partition, echo=FALSE, eval=FALSE} | |
| from sklearn.model_selection import train_test_split | |
| X_train, X_test, y_train, y_test = train_test_split(df['Consumer complaint narrative'], df['Product'], random_state = 0) | |
| ``` | |
| ```{python fit-tsne} | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.manifold import TSNE | |
| from sklearn.decomposition import TruncatedSVD | |
| text_tsne = Pipeline([ | |
| ('tfidf', TfidfVectorizer()), | |
| ('svd', TruncatedSVD(n_components=50)), | |
| ('tsne', TSNE(n_components=2)), | |
| ]) | |
| # change these! | |
| points = text_tsne.fit_transform(df['Consumer complaint narrative']) | |
| targets = df['Product'] | |
| tsne_df = pd.DataFrame({'x':points[:,0],'y':points[:,1],'label':targets}) | |
| ``` | |
| ## Run R on it | |
| ```{r} | |
| library(tidyverse) | |
| library(rsample) | |
| # Seed the RNG so the random train/test partition is the same on every knit. | |
| set.seed(0) | |
| # Named data_split rather than split to avoid masking base::split(). | |
| data_split <- py$tsne_df %>% | |
|   mutate(label = as.factor(label)) %>% | |
|   initial_split() | |
| train_data <- training(data_split) | |
| test_data <- testing(data_split) | |
| # Training points in the 2-D embedding, filled by product label. | |
| train_data %>% | |
|   ggplot(aes(x = x, y = y)) + | |
|   geom_point(aes(fill = label), color = "black", size = 0.7, shape = 21) + | |
|   theme_bw() + | |
|   theme(legend.position = "bottom") | |
| ``` | |
| ```{r} | |
| # Fit a 100-tree random forest predicting the product label from the two | |
| # embedding coordinates. Seeded because the forest is built with random sampling. | |
| set.seed(0) | |
| rf <- randomForest::randomForest(label ~ x + y, data = train_data, ntree = 100) | |
| # Attach the model's predictions to both partitions. | |
| train_data$predicted <- predict(rf, train_data) | |
| test_data$predicted <- predict(rf, test_data) | |
| ``` | |
| ```{r} | |
| # Bounds and resolution of the square grid on which the model is evaluated. | |
| lower_x1 <- -90 | |
| upper_x1 <- 90 | |
| lower_x2 <- -90 | |
| upper_x2 <- 90 | |
| n_grid <- 100 | |
| # Evenly spaced coordinates along each axis and their full cross product. | |
| grid_x1 <- seq(from = lower_x1, to = upper_x1, length.out = n_grid) | |
| grid_x2 <- seq(from = lower_x2, to = upper_x2, length.out = n_grid) | |
| grid_df <- expand.grid(x = grid_x1, y = grid_x2) | |
| # Predicted class at every grid point (the decision regions). | |
| grid_df$predicted <- as.character(predict(rf, newdata = grid_df)) | |
| # Class-probability matrix (one column per class); kept as a matrix so apply() | |
| # does not have to coerce a data frame back into one. | |
| prob <- predict(rf, newdata = grid_df, type = "prob") | |
| # Highest class probability at each grid point. | |
| grid_df$prob <- apply(prob, 1, max) | |
| ``` | |
| ```{r} | |
| library(yardstick) | |
| # Share of test rows whose predicted label equals the true label. | |
| accuracy(test_data, truth = label, estimate = predicted) | |
| ``` | |
| ```{r} | |
| # Cross-tabulation of predicted versus true labels on the test rows. | |
| conf_mat(test_data, truth = label, estimate = predicted) | |
| ``` | |
| ```{r} | |
| # Balanced accuracy of the test-set predictions. | |
| bal_accuracy(test_data, truth = label, estimate = predicted) | |
| ``` | |
| ```{r} | |
| # Decision regions of the random forest with the test points drawn on top. | |
| # The raster is filled by predicted class and its opacity follows the grid's | |
| # `prob` column; test points are filled by their true label. | |
| # Alternatives left by the original author: geom_hex(alpha = 0.4), or a fixed | |
| # raster alpha of 0.3 instead of mapping it to `prob`. | |
| ggplot(data = grid_df, mapping = aes(x = x, y = y, fill = predicted)) + | |
|   geom_raster(mapping = aes(alpha = prob), interpolate = TRUE) + | |
|   geom_point( | |
|     data = test_data, | |
|     mapping = aes(fill = label), | |
|     color = "black", | |
|     shape = 21, | |
|     size = 0.5 | |
|   ) + | |
|   theme(legend.position = "right") + | |
|   labs(x = "x1", y = "x2", title = "Random Forest on t-SNE reduced") | |
| ``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment