wesslen · May 19, 2019 19:09
diff --git a/cfpb-complaints.Rmd b/cfpb-complaints.Rmd
 ---
 name: "Ryan Wesslen"
 title: "cfpb complaints/reticulate"
 output: html_document
 ---

 ```{r setup, include=FALSE}
 # Show the source of every chunk in the rendered document.
 knitr::opts_chunk$set(echo = TRUE)

 # reticulate runs the Python chunks and exposes their objects through `py$`;
 # tidyverse supplies the pipe, dplyr, ggplot2 and readr used further down.
 library(reticulate)
 library(tidyverse)
 ```

 ## R Markdown

 Let's use pandas to load the original data.

 ```{python data}
 import pandas as pd

 # Read the raw complaints file using pandas' pure-Python parser engine.
 data = pd.read_csv('complaints.csv', engine='python')

 # Preview the first five rows.
 data.head(5)
 ```

 Let's keep only three of the products.

 ```{python product-clean}
 # The three product categories kept for the analysis.
 products = [
     'Credit card',
     'Checking or savings account',
     'Mortgage',
 ]

 # Keep only the rows whose Product is one of the categories above.
 keep_product = data["Product"].isin(products)
 df = data.loc[keep_product]
 ```

 Let's now remove rare sub-products 

 ```{python sub-product}
 # Rare sub-products to drop from the already product-filtered frame.
 subproducts = ['CD (Certificate of Deposit)','Personal line of credit','Reverse mortgage','VA mortgage']

 # Build the mask from `df` itself rather than the unfiltered `data`, so the
 # boolean indexer has exactly the same index as the frame it filters.
 # `.copy()` makes `df` an independent frame, so the in-place label edits in
 # the following chunk do not operate on a slice of `data`.
 df = df.loc[~df["Sub-product"].isin(subproducts)].copy()

 # Number of complaints per remaining sub-product.
 h = df.groupby(['Sub-product'])['Sub-product'].count()

 print(h)
 ```

 ```{python sub-product2}
 # Shorten two verbose mortgage sub-product labels.
 is_heloc = df['Sub-product'] == "Home equity loan or line of credit (HELOC)"
 df.loc[is_heloc, 'Sub-product'] = "Home equity loan or line of credit"

 is_other_mortgage = df['Sub-product'] == "Other type of mortgage"
 df.loc[is_other_mortgage, 'Sub-product'] = "Other mortgage"

 # Rows whose sub-product is the literal string "None" get a catch-all label.
 # NOTE(review): values parsed as NaN would not match this comparison —
 # confirm how the CSV encodes a missing sub-product.
 is_none = df['Sub-product'] == "None"
 df.loc[is_none, 'Sub-product'] = "Other banking product or service"

 # Every "Credit card" row gets the sub-product "Credit card"; this runs last,
 # so it overrides any label assigned above.
 is_credit_card = df['Product'] == "Credit card"
 df.loc[is_credit_card, 'Sub-product'] = "Credit card"
 ```

 # Product-Issue Hierarchy

 ```{r}
 library(collapsibleTree)
 library(RColorBrewer)

 # Levels of the tree, from the one nearest the root to the leaves.
 tree_levels <- c("Product", "Sub-product", "Issue", "Sub-issue")

 # One row per distinct combination of the four levels, with its row count in `n`.
 hierarchy_counts <- count(py$df, Product, `Sub-product`, Issue, `Sub-issue`)

 # Interactive collapsible tree: tooltips show `n`, nodes are sized by leaf count.
 collapsibleTree(
   hierarchy_counts,
   hierarchy = tree_levels,
   root = "Total",
   tooltip = TRUE,
   attribute = "n",
   nodeSize = "leafCount",
   width = 800,
   zoomable = FALSE
 )
 ```


 ```{python}
 # Count the complaints filed under each Issue and print the tally.
 issue_groups = df.groupby(['Issue'])
 h = issue_groups['Issue'].count()
 print(h)
 ```


 ```{r}
 # Write the cleaned complaints frame from the Python session to disk.
 # `write_csv()` needs a file path; the bare symbol `t` resolves to base R's
 # transpose function, not a path.
 # NOTE(review): the file name below is a placeholder — confirm the intended
 # output path.
 py$df %>%
  write_csv("complaints-clean.csv")
 ```


 ```{r eval=FALSE}
 # Horizontal bar chart of the number of complaints per product.
 # `count(Product)` stores the tally in a column named `n`, so the plot must
 # map `n` (there is no `count` column), both for ordering and for bar height.
 py$df %>%
  count(Product) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x = forcats::fct_reorder(Product, n), y = n)) +
  geom_col() +
  coord_flip() +
  ggthemes::theme_economist() +
  labs(x = " ", y = "Number of Complaints", title = "CFPB Complaints by product")
 ```


 ## Product Only

 ```{python partition, echo=FALSE, eval=FALSE}
 from sklearn.model_selection import train_test_split

 # Complaint narratives are the inputs, product names the targets.
 narratives = df['Consumer complaint narrative']
 product_labels = df['Product']

 # Split both into train and test parts; random_state fixes the shuffle.
 X_train, X_test, y_train, y_test = train_test_split(narratives, product_labels, random_state = 0)
 ```

 ```{python fit-tsne}
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.manifold import TSNE
 from sklearn.pipeline import Pipeline

 # Text -> TF-IDF matrix -> 50 SVD components -> 2-D t-SNE embedding.
 embedding_steps = [
     ('tfidf', TfidfVectorizer()),
     ('svd', TruncatedSVD(n_components=50)),
     ('tsne', TSNE(n_components=2)),
 ]
 text_tsne = Pipeline(embedding_steps)

 # change these!
 # NOTE(review): presumably the narrative column has no missing values at this
 # point — confirm before fitting.
 narratives = df['Consumer complaint narrative']
 targets = df['Product']
 points = text_tsne.fit_transform(narratives)

 # One row per complaint: the two embedding coordinates plus the product label.
 tsne_df = pd.DataFrame({'x':points[:,0],'y':points[:,1],'label':targets})
 ```


 ## Run R on it

 ```{r}
 library(tidyverse)
 library(rsample)

 # Pull the embedding from the Python session and turn the label into a factor.
 embedding <- mutate(py$tsne_df, label = as.factor(label))

 # Random train/test partition using rsample's default settings.
 split <- initial_split(embedding)
 train_data <- training(split)
 test_data <- testing(split)

 # Scatter plot of the training points, filled by label.
 ggplot(train_data, aes(x = x, y = y)) +
  geom_point(aes(fill = label), color = "black", size = 0.7, shape = 21) +
  theme_bw() +
  theme(legend.position = "bottom")
 ```








 ```{r}
 # Fit a 100-tree random forest predicting the label from the two coordinates.
 rf <- randomForest::randomForest(label ~ x + y, data = train_data, ntree = 100)

 # Attach the forest's predictions to both partitions.
 train_data$predicted <- predict(rf, newdata = train_data)
 test_data$predicted <- predict(rf, newdata = test_data)
 ```

 ```{r}
 # Bounds of the prediction grid on each axis and the number of points per axis.
 # NOTE(review): the -90/90 limits are hard-coded — confirm they cover the
 # range of x and y in the embedding.
 lower_x1 <- -90
 upper_x1 <- 90
 lower_x2 <- -90
 upper_x2 <- 90
 n_grid <- 100

 # Evenly spaced values per axis, crossed into every (x, y) pair.
 grid_x1 <- seq(lower_x1, upper_x1, length.out = n_grid)
 grid_x2 <- seq(lower_x2, upper_x2, length.out = n_grid)
 grid_df <- expand.grid(x = grid_x1, y = grid_x2)

 # Predicted class at every grid point, stored as character.
 grid_df$predicted <- as.character(predict(rf, newdata = grid_df))

 # Per-class probabilities at every grid point; keep the largest one per row.
 prob <- as.data.frame(predict(rf, newdata = grid_df, type = "prob"))
 grid_df$prob <- apply(prob, MARGIN = 1, FUN = max)
 ```

 ```{r}
 library(yardstick)

 # Accuracy of the predictions on the test partition.
 test_data %>%
  accuracy(truth = label, estimate = predicted)
 ```

 ```{r}
 # Confusion matrix of true label versus predicted label on the test partition.
 test_data %>%
  conf_mat(truth = label, estimate = predicted)
 ```

 ```{r}
 # Balanced accuracy of the predictions on the test partition.
 test_data %>%
  bal_accuracy(truth = label, estimate = predicted)
 ```

 ```{r}
 # Background layer: predicted class on the grid, with opacity mapped to the
 # winning class probability.
 # Alternatives left by the author: geom_hex(alpha=0.4), or a fixed alpha = 0.3.
 decision_surface <- geom_raster(aes(alpha = prob), interpolate = TRUE)

 # Foreground layer: test points filled by their true label.
 held_out_points <- geom_point(
   data = test_data, aes(fill = label),
   color = "black", shape = 21, size = 0.5
 )

 ggplot(grid_df, aes(x = x, y = y, fill = predicted)) +
  decision_surface +
  held_out_points +
  theme(legend.position = "right") +
  labs(x = "x1", y = "x2", title = "Random Forest on t-SNE reduced")
 ```
	---
	name: "Ryan Wesslen"
	title: "cfpb complaints/reticulate"
	output: html_document
	---

	```{r setup, include=FALSE}
	# Show the source of every chunk in the rendered document.
	knitr::opts_chunk$set(echo = TRUE)

	# reticulate runs the Python chunks and exposes their objects through `py$`;
	# tidyverse supplies the pipe, dplyr, ggplot2 and readr used further down.
	library(reticulate)
	library(tidyverse)
	```

	## R Markdown

	Let's use pandas to load the original data.

	```{python data}
	import pandas as pd

	# Read the raw complaints file using pandas' pure-Python parser engine.
	data = pd.read_csv('complaints.csv', engine='python')

	# Preview the first five rows.
	data.head(5)
	```

	Let's keep only three of the products.

	```{python product-clean}
	# The three product categories kept for the analysis.
	products = [
	    'Credit card',
	    'Checking or savings account',
	    'Mortgage',
	]

	# Keep only the rows whose Product is one of the categories above.
	keep_product = data["Product"].isin(products)
	df = data.loc[keep_product]
	```

	Let's now remove rare sub-products

	```{python sub-product}
	# Rare sub-products to drop from the already product-filtered frame.
	subproducts = ['CD (Certificate of Deposit)','Personal line of credit','Reverse mortgage','VA mortgage']

	# Build the mask from `df` itself rather than the unfiltered `data`, so the
	# boolean indexer has exactly the same index as the frame it filters.
	# `.copy()` makes `df` an independent frame, so the in-place label edits in
	# the following chunk do not operate on a slice of `data`.
	df = df.loc[~df["Sub-product"].isin(subproducts)].copy()

	# Number of complaints per remaining sub-product.
	h = df.groupby(['Sub-product'])['Sub-product'].count()

	print(h)
	```

	```{python sub-product2}
	# Shorten two verbose mortgage sub-product labels.
	is_heloc = df['Sub-product'] == "Home equity loan or line of credit (HELOC)"
	df.loc[is_heloc, 'Sub-product'] = "Home equity loan or line of credit"

	is_other_mortgage = df['Sub-product'] == "Other type of mortgage"
	df.loc[is_other_mortgage, 'Sub-product'] = "Other mortgage"

	# Rows whose sub-product is the literal string "None" get a catch-all label.
	# NOTE(review): values parsed as NaN would not match this comparison —
	# confirm how the CSV encodes a missing sub-product.
	is_none = df['Sub-product'] == "None"
	df.loc[is_none, 'Sub-product'] = "Other banking product or service"

	# Every "Credit card" row gets the sub-product "Credit card"; this runs last,
	# so it overrides any label assigned above.
	is_credit_card = df['Product'] == "Credit card"
	df.loc[is_credit_card, 'Sub-product'] = "Credit card"
	```

	# Product-Issue Hierarchy

	```{r}
	library(collapsibleTree)
	library(RColorBrewer)

	# Levels of the tree, from the one nearest the root to the leaves.
	tree_levels <- c("Product", "Sub-product", "Issue", "Sub-issue")

	# One row per distinct combination of the four levels, with its row count in `n`.
	hierarchy_counts <- count(py$df, Product, `Sub-product`, Issue, `Sub-issue`)

	# Interactive collapsible tree: tooltips show `n`, nodes are sized by leaf count.
	collapsibleTree(
	  hierarchy_counts,
	  hierarchy = tree_levels,
	  root = "Total",
	  tooltip = TRUE,
	  attribute = "n",
	  nodeSize = "leafCount",
	  width = 800,
	  zoomable = FALSE
	)
	```


	```{python}
	# Count the complaints filed under each Issue and print the tally.
	issue_groups = df.groupby(['Issue'])
	h = issue_groups['Issue'].count()
	print(h)
	```


	```{r}
	# Write the cleaned complaints frame from the Python session to disk.
	# `write_csv()` needs a file path; the bare symbol `t` resolves to base R's
	# transpose function, not a path.
	# NOTE(review): the file name below is a placeholder — confirm the intended
	# output path.
	py$df %>%
	  write_csv("complaints-clean.csv")
	```


	```{r eval=FALSE}
	# Horizontal bar chart of the number of complaints per product.
	# `count(Product)` stores the tally in a column named `n`, so the plot must
	# map `n` (there is no `count` column), both for ordering and for bar height.
	py$df %>%
	  count(Product) %>%
	  arrange(desc(n)) %>%
	  ggplot(aes(x = forcats::fct_reorder(Product, n), y = n)) +
	  geom_col() +
	  coord_flip() +
	  ggthemes::theme_economist() +
	  labs(x = " ", y = "Number of Complaints", title = "CFPB Complaints by product")
	```


	## Product Only

	```{python partition, echo=FALSE, eval=FALSE}
	from sklearn.model_selection import train_test_split

	# Complaint narratives are the inputs, product names the targets.
	narratives = df['Consumer complaint narrative']
	product_labels = df['Product']

	# Split both into train and test parts; random_state fixes the shuffle.
	X_train, X_test, y_train, y_test = train_test_split(narratives, product_labels, random_state = 0)
	```

	```{python fit-tsne}
	from sklearn.decomposition import TruncatedSVD
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.manifold import TSNE
	from sklearn.pipeline import Pipeline

	# Text -> TF-IDF matrix -> 50 SVD components -> 2-D t-SNE embedding.
	embedding_steps = [
	    ('tfidf', TfidfVectorizer()),
	    ('svd', TruncatedSVD(n_components=50)),
	    ('tsne', TSNE(n_components=2)),
	]
	text_tsne = Pipeline(embedding_steps)

	# change these!
	# NOTE(review): presumably the narrative column has no missing values at this
	# point — confirm before fitting.
	narratives = df['Consumer complaint narrative']
	targets = df['Product']
	points = text_tsne.fit_transform(narratives)

	# One row per complaint: the two embedding coordinates plus the product label.
	tsne_df = pd.DataFrame({'x':points[:,0],'y':points[:,1],'label':targets})
	```


	## Run R on it

	```{r}
	library(tidyverse)
	library(rsample)

	# Pull the embedding from the Python session and turn the label into a factor.
	embedding <- mutate(py$tsne_df, label = as.factor(label))

	# Random train/test partition using rsample's default settings.
	split <- initial_split(embedding)
	train_data <- training(split)
	test_data <- testing(split)

	# Scatter plot of the training points, filled by label.
	ggplot(train_data, aes(x = x, y = y)) +
	  geom_point(aes(fill = label), color = "black", size = 0.7, shape = 21) +
	  theme_bw() +
	  theme(legend.position = "bottom")
	```








	```{r}
	# Fit a 100-tree random forest predicting the label from the two coordinates.
	rf <- randomForest::randomForest(label ~ x + y, data = train_data, ntree = 100)

	# Attach the forest's predictions to both partitions.
	train_data$predicted <- predict(rf, newdata = train_data)
	test_data$predicted <- predict(rf, newdata = test_data)
	```

	```{r}
	# Bounds of the prediction grid on each axis and the number of points per axis.
	# NOTE(review): the -90/90 limits are hard-coded — confirm they cover the
	# range of x and y in the embedding.
	lower_x1 <- -90
	upper_x1 <- 90
	lower_x2 <- -90
	upper_x2 <- 90
	n_grid <- 100

	# Evenly spaced values per axis, crossed into every (x, y) pair.
	grid_x1 <- seq(lower_x1, upper_x1, length.out = n_grid)
	grid_x2 <- seq(lower_x2, upper_x2, length.out = n_grid)
	grid_df <- expand.grid(x = grid_x1, y = grid_x2)

	# Predicted class at every grid point, stored as character.
	grid_df$predicted <- as.character(predict(rf, newdata = grid_df))

	# Per-class probabilities at every grid point; keep the largest one per row.
	prob <- as.data.frame(predict(rf, newdata = grid_df, type = "prob"))
	grid_df$prob <- apply(prob, MARGIN = 1, FUN = max)
	```

	```{r}
	library(yardstick)

	# Accuracy of the predictions on the test partition.
	test_data %>%
	  accuracy(truth = label, estimate = predicted)
	```

	```{r}
	# Confusion matrix of true label versus predicted label on the test partition.
	test_data %>%
	  conf_mat(truth = label, estimate = predicted)
	```

	```{r}
	# Balanced accuracy of the predictions on the test partition.
	test_data %>%
	  bal_accuracy(truth = label, estimate = predicted)
	```

	```{r}
	# Background layer: predicted class on the grid, with opacity mapped to the
	# winning class probability.
	# Alternatives left by the author: geom_hex(alpha=0.4), or a fixed alpha = 0.3.
	decision_surface <- geom_raster(aes(alpha = prob), interpolate = TRUE)

	# Foreground layer: test points filled by their true label.
	held_out_points <- geom_point(
	  data = test_data, aes(fill = label),
	  color = "black", shape = 21, size = 0.5
	)

	ggplot(grid_df, aes(x = x, y = y, fill = predicted)) +
	  decision_surface +
	  held_out_points +
	  theme(legend.position = "right") +
	  labs(x = "x1", y = "x2", title = "Random Forest on t-SNE reduced")
	```
No results found