# Created June 28, 2018 15:25
# Read the wine review data; text columns stay character, not factor
dataset <- read.csv(
  "winemag-data.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
glimpse(dataset, width = 102)

# Descriptive statistics (missing values are ignored in every call)
# Quartiles of the review scores
quantile(dataset$points, na.rm = TRUE)
# Range, mean, standard deviation and quartiles of the prices
min(dataset$price, na.rm = TRUE)
max(dataset$price, na.rm = TRUE)
mean(dataset$price, na.rm = TRUE)
sd(dataset$price, na.rm = TRUE)
quantile(dataset$price, na.rm = TRUE)
# Bar chart of review counts per distinct points value.
# geom_bar() counts rows per x value by default, which is what the
# previous geom_histogram(stat = "count") was asking for.
ggplot(dataset, aes(points)) +
  geom_bar()

# Same chart for prices, restricted to wines priced at or below $100
# (subset() also drops rows whose price is NA)
ggplot(subset(dataset, price <= 100), aes(price)) +
  geom_bar()
# Scatter plot of price against points for wines priced at or below $100.
# geom_count() sizes each position by the number of overlapping rows and
# geom_smooth(method = "lm") overlays a linear fit; the legend is hidden.
# The chain must end without a trailing `+`, otherwise the parser pulls the
# next statement into this plot expression.
ggplot(subset(dataset, price <= 100), aes(x = points, y = price)) +
  geom_point() +
  geom_count() +
  geom_smooth(method = "lm") +
  theme(legend.position = "none")
# Keep only the rows whose country is "France"
dataset_filtered <- filter(dataset, country == "France")

# Mean price of the filtered subset, ignoring missing prices
mean(dataset_filtered$price, na.rm = TRUE)
# Count the most commonly used words in the reviews.
# Result: one row per (province, word) pair with its frequency in column `n`,
# sorted by descending frequency. The later plots and bind_tf_idf() rely on
# that `n` column.
tokenized_comments <- dataset_filtered %>%
  select(description, designation, points, price, province, region_1,
         variety, winery) %>%
  unnest_tokens(word, description) %>%   # one row per word of the description
  anti_join(stop_words, by = "word") %>% # drop stop words
  filter(word != "wine") %>%             # need to take out the word "wine"
  count(province, word, sort = TRUE)

tokenized_comments %>% glimpse()
# Plot top review words by province.
# Within each province keep the 15 rows with the highest count `n` (ties are
# kept by top_n()), then draw one horizontal bar chart per province.
tokenized_comments %>%
  group_by(province) %>%
  top_n(15, n) %>% # rank explicitly by `n` rather than by the last column
  ungroup() %>%
  arrange(desc(n)) %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = factor(province))) +
  geom_col() +
  theme(legend.position = "none") +
  facet_wrap(~ province, scales = "free") +
  coord_flip() +
  labs(
    x = "Top words",
    y = "Frequency",
    title = "Top words used in Wine Enthusiast reviews of French wines by province",
    subtitle = ""
  )
# Plot the top 25 tf-idf words per province.
# Each province is treated as one document: bind_tf_idf() adds tf, idf and
# tf_idf columns computed from the per-province word counts in `n`.
tf_idf_words <- tokenized_comments %>%
  bind_tf_idf(word, province, n)

# Before ranking, drop tokens that start with the digit 8 or 9 and a fixed
# list of words (including several region names). Then keep the 25 highest
# tf-idf rows within each province, matching the per-province facets below.
tf_idf_words %>%
  filter(!str_detect(word, "^[89]")) %>%
  filter(!word %in% c("sample", "barrel", "bordeaux", "loire",
                      "beaujolais", "champagne", "provence")) %>%
  group_by(province) %>%
  top_n(25, tf_idf) %>%
  ungroup() %>%
  arrange(desc(tf_idf)) %>%
  ggplot(aes(x = reorder(word, -tf_idf), y = tf_idf, fill = province)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  facet_wrap(~ province, scales = "free") +
  labs(
    x = "Top words",
    y = "TF-IDF score",
    title = "Words characteristic of French wines by province based on 150,000 Wine Enthusiast reviews",
    subtitle = ""
  )
# Map
# Draw the "france" map, cycling through colours 1 to 10, then fetch the
# same map as a data frame for use with ggplot2
map("france", col = 1:10)
francemap <- map_data("france")

# Wine rows used for the maps below; text columns stay character
francewines <- read.csv(
  "wines-france.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
glimpse(francewines, width = 102)
# Base map: the polygons of `francemap` filled light grey with white borders,
# on an empty theme. Each `+` has to end a line; a line that starts with `+`
# is parsed as a separate statement and theme_void() is never applied.
ggplot() +
  geom_polygon(
    data = francemap,
    aes(x = long, y = lat, group = group),
    color = "white", fill = "grey92"
  ) +
  theme_void()
# Per-province summary used by the bubble maps: one row per province with
# its coordinates, number of wines, mean price and mean points.
# summarise() needs a single value per group, so lat/lon are averaged.
# NOTE(review): this assumes lat/lon describe the province (constant within
# a group, or meaningful as a centroid) - confirm against wines-france.csv.
counts <- francewines %>%
  group_by(province) %>%
  summarise(
    lat = mean(lat, na.rm = TRUE),
    lon = mean(lon, na.rm = TRUE),
    count = n(),
    mean_price = mean(price, na.rm = TRUE),
    mean_points = mean(points, na.rm = TRUE)
  )

counts %>% glimpse()
# Bubble map of average points: one light-blue point per row of `counts`,
# sized by mean_points and labelled with the province name
# (check_overlap = TRUE skips labels that would overlap earlier ones).
# The chain ends here; a trailing `+` would try to add the next ggplot()
# call to this plot.
ggplot() +
  geom_polygon(data = francemap, aes(x = long, y = lat, group = group),
               color = "white", fill = "grey92") +
  geom_point(data = counts, aes(x = lon, y = lat, size = mean_points),
             color = "lightblue") +
  scale_size(name = "Average points", range = c(1, 15)) +
  geom_text(data = counts, aes(x = lon, y = lat, label = province),
            vjust = 0.3, hjust = -0.3, check_overlap = TRUE)
# Bubble map of average price: draws the `francemap` polygons, then one
# firebrick point per row of `counts`, sized by mean_price and labelled with
# the province name (check_overlap = TRUE skips overlapping labels).
# NOTE(review): the last line ends with a trailing `+`, so this expression is
# incomplete as shown. Either a final layer follows beyond this view or the
# `+` is left over and should be removed - confirm.
ggplot() + geom_polygon(data=francemap, aes(x=long, y=lat, group = group),color="white", fill="grey92" ) +
geom_point(data=counts, aes(x=lon, y=lat, size = mean_price), color="firebrick3") +
scale_size(name="Average price", range = c(1, 15)) +
geom_text(data = counts, aes(x = lon, y = lat, label = province), vjust = 0.3, hjust = -0.3, check_overlap = TRUE) +
