Created
June 28, 2018 15:25
-
-
Save aleszu/106fd985b0933d3e41694418ba7f2550 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Attach the packages used throughout the analysis.
library(tidyverse)
library(ggplot2)

# Read the wine review data; text columns stay character vectors rather than
# being converted to factors.
dataset <- read.csv(
  file = "winemag-data.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Print a column-by-column preview, formatted for a 102-character width.
glimpse(dataset, width = 102)
# Quick view and descriptive stats
# Every summary drops missing values (na.rm = TRUE) so the points and price
# figures are computed the same way; without it a single NA in a column would
# turn min/max/mean/sd into NA.

# Review scores
min(dataset$points, na.rm = TRUE)
max(dataset$points, na.rm = TRUE)
mean(dataset$points, na.rm = TRUE)
sd(dataset$points, na.rm = TRUE)
quantile(dataset$points, na.rm = TRUE)

# Prices
min(dataset$price, na.rm = TRUE)
max(dataset$price, na.rm = TRUE)
mean(dataset$price, na.rm = TRUE)
sd(dataset$price, na.rm = TRUE)
quantile(dataset$price, na.rm = TRUE)
# Histogram of points
# geom_bar() counts the rows at each distinct x value, which is what
# geom_histogram(stat = "count") computed, but without the warning about
# ignored binning parameters.
ggplot(dataset, aes(x = points)) +
  geom_bar()

# Histogram of prices at or below $100
# subset() also drops rows whose price is missing.
ggplot(subset(dataset, price <= 100), aes(x = price)) +
  geom_bar()
# Scatter plot
# Price against points for wines priced at or below $100, one panel per
# country. geom_count() sizes markers by the number of overlapping
# observations and geom_smooth() overlays a linear fit; the legend is hidden.
dataset %>%
  subset(price <= 100) %>%
  ggplot(mapping = aes(x = points, y = price)) +
  geom_point() +
  geom_count() +
  geom_smooth(method = "lm") +
  theme(legend.position = "none") +
  facet_wrap(~ country)
# Filter by keyword: keep only the rows whose country is "France"
dataset_filtered <- filter(dataset, country == "France")

# Basic descriptive stats of subset: mean score, and mean price with missing
# prices ignored
mean(dataset_filtered[["points"]])
mean(dataset_filtered[["price"]], na.rm = TRUE)
# Count most commonly used words in reviews
library(tidytext)

# Split each description into one word per row, drop stop words and the word
# "wine" itself, then count how often each word occurs per province.
# The stop-word join key is spelled out so the join cannot silently pick up
# other shared column names (and no "Joining, by" message is printed).
# group_by() + tally() leaves the result grouped by province.
tokenized_comments <- dataset_filtered %>%
  select(
    description, designation, points, price,
    province, region_1, variety, winery
  ) %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  filter(word != "wine") %>% # need to take out the word "wine"
  group_by(province, word) %>%
  tally()

glimpse(tokenized_comments)
# Plot top review words by province
# For each province keep the 15 most frequent words (ties included) and draw
# one horizontal bar chart per province.
# reorder_within() / scale_x_reordered() sort the bars inside every facet;
# a plain reorder(word, n) would order words by their average count across
# all provinces, leaving individual panels unsorted.
tokenized_comments %>%
  group_by(province) %>%
  top_n(15, n) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, n, province)) %>%
  ggplot(aes(x = word, y = n, fill = factor(province))) +
  geom_bar(stat = "identity") +
  scale_x_reordered() +
  theme(legend.position = "none") +
  facet_wrap(~ province, scales = "free") +
  coord_flip() +
  labs(x = "Top words",
       y = "Frequency",
       title = "Top words used in Wine Enthusiast reviews of French wines by province",
       subtitle = "")
# Plot top 25 tf-idf words per province
library(stringr)

# Treat each province as a document and score every word by tf-idf; the
# table is sorted so the highest tf-idf scores come first.
tf_idf_words <- tokenized_comments %>%
  bind_tf_idf(word, province, n) %>%
  arrange(desc(tf_idf))

tf_idf_words

# Words removed before plotting:
#   - tokens starting with 8 or 9 (presumably numeric scores quoted in the
#     review text -- TODO confirm),
#   - "sample" and "barrel" (presumably from "barrel sample" notes -- TODO
#     confirm),
#   - region names, which would otherwise dominate their own panel.
# Grouping by province is explicit so the top-25 cut is taken per province
# regardless of how tf_idf_words happens to be grouped.
# reorder_within() / scale_x_reordered() sort bars inside each facet; the
# negated score keeps the original direction (highest score at the bottom
# after coord_flip()).
tf_idf_words %>%
  filter(
    !str_detect(word, "^8"),
    !str_detect(word, "^9")
  ) %>%
  filter(!word %in% c(
    "sample", "barrel", "bordeaux", "loire",
    "beaujolais", "champagne", "provence"
  )) %>%
  group_by(province) %>%
  top_n(25, tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, -tf_idf, province)) %>%
  ggplot(aes(x = word, y = tf_idf, fill = province)) +
  geom_col() +
  scale_x_reordered() +
  coord_flip() +
  theme(legend.position = "none") +
  facet_wrap(~ province, scales = "free") +
  labs(x = "Top words",
       y = "TF-IDF score",
       title = "Words characteristic of French wines by province based on 150,000 Wine Enthusiast reviews",
       subtitle = "")
# Map
library(maps)

# Draw the base-graphics map of France, cycling through ten colours.
# The call is namespace-qualified because purrr (attached by tidyverse) also
# exports a function named map().
maps::map("france", col = 1:10)

# Polygon outlines of France as a data frame for use with ggplot2.
francemap <- map_data("france")

# Read the French wine data used for the maps below; text columns stay
# character vectors.
francewines <- read.csv(
  file = "wines-france.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Print a column-by-column preview, formatted for a 102-character width.
glimpse(francewines, width = 102)
# Blank base map of France: light grey polygons with white borders and no
# axes or background.
# The `+` must end the previous line; a line that starts with `+` is parsed
# as a separate expression, so the theme would never be applied.
ggplot() +
  geom_polygon(
    data = francemap,
    aes(x = long, y = lat, group = group),
    color = "white", fill = "grey92"
  ) +
  theme_void()
# One summary row per province: a coordinate pair, the number of rows, the
# mean price (missing prices ignored) and the mean score.
# first() collapses lat/lon to a single value per group so summarise()
# returns exactly one row per province.
# NOTE(review): assumes every row of a province carries the same lat/lon --
# confirm against wines-france.csv.
counts <- francewines %>%
  group_by(province) %>%
  summarise(
    lat = first(lat),
    lon = first(lon),
    count = n(),
    mean_price = mean(price, na.rm = TRUE),
    mean_points = mean(points)
  )

glimpse(counts)
# Bubble map of average score: one light blue point per row of `counts`,
# sized by mean_points and labelled with the province name. Labels that
# would overlap an earlier label are not drawn.
ggplot() +
  geom_polygon(
    data = francemap,
    mapping = aes(x = long, y = lat, group = group),
    color = "white",
    fill = "grey92"
  ) +
  geom_point(
    data = counts,
    mapping = aes(x = lon, y = lat, size = mean_points),
    color = "lightblue"
  ) +
  scale_size(name = "Average points", range = c(1, 15)) +
  geom_text(
    data = counts,
    mapping = aes(x = lon, y = lat, label = province),
    vjust = 0.3,
    hjust = -0.3,
    check_overlap = TRUE
  ) +
  theme_void()
# Bubble map of average price: one red point per row of `counts`, sized by
# mean_price and labelled with the province name. Labels that would overlap
# an earlier label are not drawn.
ggplot() +
  geom_polygon(
    data = francemap,
    mapping = aes(x = long, y = lat, group = group),
    color = "white",
    fill = "grey92"
  ) +
  geom_point(
    data = counts,
    mapping = aes(x = lon, y = lat, size = mean_price),
    color = "firebrick3"
  ) +
  scale_size(name = "Average price", range = c(1, 15)) +
  geom_text(
    data = counts,
    mapping = aes(x = lon, y = lat, label = province),
    vjust = 0.3,
    hjust = -0.3,
    check_overlap = TRUE
  ) +
  theme_void()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment