---
title: "Wine Ratings"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(tidyverse)
theme_set(theme_light())
wine_ratings <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-28/winemag-data-130k-v2.csv") %>%
select(-X1) %>%
extract(title, "year", "(20\\d\\d)", convert = TRUE, remove = FALSE) %>%
mutate(year = ifelse(year < 1900, NA, year)) %>%
filter(!is.na(price))
```
```{r}
wine_ratings %>%
count(country, sort = T)
wine_ratings %>%
count(designation, sort = T)
wine_ratings %>%
count(country, region_1, sort = TRUE)
wine_ratings %>%
count(taster_name, sort = TRUE)
wine_ratings %>%
filter(!is.na(designation)) %>%
count(variety, designation, sort = TRUE)
wine_ratings %>%
ggplot(aes(year)) +
geom_histogram()
wine_ratings %>%
ggplot(aes(points)) +
geom_histogram(binwidth = 1)
wine_ratings %>%
ggplot(aes(price)) +
geom_histogram() +
scale_x_log10()
```
```{r}
ggplot(wine_ratings, aes(price, points)) +
geom_point(alpha = .1) +
geom_smooth(method = "lm") +
scale_x_log10()
summary(lm(points ~ log2(price), wine_ratings))
```
Every time the price doubles, the expected number of points goes up by 2.
```{r}
library(broom)
model <- wine_ratings %>%
replace_na(list(taster_name = "Missing", country = "Missing")) %>%
mutate(country = fct_relevel(fct_lump(country, 7), "US"),
taster_name = fct_relevel(fct_lump(taster_name, 6), "Missing")) %>%
lm(points ~ log2(price) + country + year + taster_name, data = .)
model %>%
tidy(conf.int = TRUE) %>%
filter(term != "(Intercept)") %>%
mutate(term = str_replace(term, "country", "Country: "),
term = str_replace(term, "taster_name", "Taster: "),
term = fct_reorder(term, estimate)) %>%
ggplot(aes(estimate, term)) +
geom_point() +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high))
```
```{r}
model %>%
augment(data = wine_ratings) %>%
ggplot(aes(.fitted, points)) +
geom_point(alpha = .1)
tidy(anova(model)) %>%
mutate(sumsq / sum(sumsq))
```
### Lasso regression on words in description
```{r}
library(tidytext)
wine_rating_words <- wine_ratings %>%
mutate(wine_id = row_number()) %>%
unnest_tokens(word, description) %>%
anti_join(stop_words, by = "word") %>%
filter(!word %in% c("wine", "drink"),
str_detect(word, "[a-z]"))
wine_rating_words %>%
count(word, sort = TRUE) %>%
head(20) %>%
mutate(word = fct_reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
coord_flip()
```
```{r}
library(widyr)
wine_words_filtered <- wine_rating_words %>%
distinct(wine_id, word) %>%
add_count(word) %>%
filter(n >= 100)
wine_words_filtered %>%
pairwise_cor(word, wine_id, sort = TRUE)
```
```{r}
library(Matrix)
wine_word_matrix <- wine_words_filtered %>%
cast_sparse(wine_id, word)
wine_ids <- as.integer(rownames(wine_word_matrix))
scores <- wine_ratings$points[wine_ids]
library(glmnet)
wine_word_matrix_extra <- cbind(wine_word_matrix, log_price = log2(wine_ratings$price[wine_ids]))
library(doMC)
registerDoMC(cores = 4)
cv_glmnet_model <- cv.glmnet(wine_word_matrix_extra, scores, parallel = TRUE)
plot(cv_glmnet_model)
```
```{r}
lexicon <- cv_glmnet_model$glmnet.fit %>%
tidy() %>%
filter(lambda == cv_glmnet_model$lambda.1se,
term != "(Intercept)",
term != "log_price") %>%
select(word = term, coefficient = estimate)
lexicon %>%
arrange(coefficient) %>%
group_by(direction = ifelse(coefficient < 0, "Negative", "Positive")) %>%
top_n(16, abs(coefficient)) %>%
ungroup() %>%
mutate(word = fct_reorder(word, coefficient)) %>%
ggplot(aes(word, coefficient, fill = direction)) +
geom_col() +
coord_flip() +
labs(x = "",
y = "Estimated effect of the word on the score",
title = "What words are predictive of a wine's score?")
```
```{r}
wine_ratings %>%
mutate(wine_id = row_number()) %>%
arrange(points) %>%
head(1) %>%
select(wine_id, description) %>%
pull(description)
wine_rating_words %>%
filter(wine_id %in% sample(unique(wine_id), 6)) %>%
distinct(word, title, points) %>%
mutate(wine = paste0(str_trunc(title, 40), " (", points, ")")) %>%
inner_join(lexicon, by = "word") %>%
mutate(word = fct_reorder(word, coefficient)) %>%
ggplot(aes(word, coefficient, fill = coefficient > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
facet_wrap(~ wine, scales = "free_y") +
labs(title = "How a lasso regression would predict each wine's score",
subtitle = "Using a lasso regression with an extra term for price",
x = "",
y = "Effect on score")
```
### What is glmnet?
```{r}
cv_glmnet_model$glmnet.fit %>%
tidy() %>%
filter(term %in% c("rich", "black", "simple", "complex", "vineyard", "concentrated")) %>%
ggplot(aes(lambda, estimate, color = term)) +
geom_line() +
scale_x_log10() +
geom_hline(lty = 2, yintercept = 0)
cv_glmnet_model$glmnet.fit %>%
tidy() %>%
count(lambda) %>%
ggplot(aes(lambda, n)) +
geom_line() +
scale_x_log10()
```
```{r}
wine_ratings %>%
mutate(country = fct_relevel(fct_lump(country, 7), "US")) %>%
mutate(country = fct_reorder(country, points)) %>%
ggplot(aes(country, points)) +
geom_boxplot() +
coord_flip()
wine_ratings %>%
group_by(year) %>%
summarize(average_points = mean(points), n())
wine_ratings %>%
mutate(reviewer = fct_reorder(fct_lump(taster_name, 10), points)) %>%
ggplot(aes(reviewer, points)) +
geom_boxplot() +
coord_flip()
```