---
title: "Wine Ratings"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library(tidyverse)
theme_set(theme_light())

wine_ratings <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-28/winemag-data-130k-v2.csv") %>%
  select(-X1) %>%
  extract(title, "year", "(20\\d\\d)", convert = TRUE, remove = FALSE) %>%
  mutate(year = ifelse(year < 1900, NA, year)) %>%
  filter(!is.na(price))
```

```{r}
wine_ratings %>%
  count(country, sort = T)

wine_ratings %>%
  count(designation, sort = T)

wine_ratings %>%
  count(country, region_1, sort = TRUE)

wine_ratings %>%
  count(taster_name, sort = TRUE)

wine_ratings %>%
  filter(!is.na(designation)) %>%
  count(variety, designation, sort = TRUE)

wine_ratings %>%
  ggplot(aes(year)) +
  geom_histogram()

wine_ratings %>%
  ggplot(aes(points)) +
  geom_histogram(binwidth = 1)

wine_ratings %>%
  ggplot(aes(price)) +
  geom_histogram() +
  scale_x_log10()
```

```{r}
ggplot(wine_ratings, aes(price, points)) +
  geom_point(alpha = .1) +
  geom_smooth(method = "lm") +
  scale_x_log10()

summary(lm(points ~ log2(price), wine_ratings))
```

Every time the price doubles, the expected number of points goes up by 2.

```{r}
library(broom)

model <- wine_ratings %>%
  replace_na(list(taster_name = "Missing", country = "Missing")) %>%
  mutate(country = fct_relevel(fct_lump(country, 7), "US"),
         taster_name = fct_relevel(fct_lump(taster_name, 6), "Missing")) %>%
  lm(points ~ log2(price) + country + year + taster_name, data = .)

model %>%
  tidy(conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(term = str_replace(term, "country", "Country: "),
         term = str_replace(term, "taster_name", "Taster: "),
         term = fct_reorder(term, estimate)) %>%
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high))
```

```{r}
model %>%
  augment(data = wine_ratings) %>%
  ggplot(aes(.fitted, points)) +
  geom_point(alpha = .1)

tidy(anova(model)) %>%
  mutate(sumsq / sum(sumsq))
```

### Lasso regression on words in description

```{r}
library(tidytext)

wine_rating_words <- wine_ratings %>%
  mutate(wine_id = row_number()) %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!word %in% c("wine", "drink"),
         str_detect(word, "[a-z]"))

wine_rating_words %>%
  count(word, sort = TRUE) %>%
  head(20) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip()
```

```{r}
library(widyr)

wine_words_filtered <- wine_rating_words %>%
  distinct(wine_id, word) %>%
  add_count(word) %>%
  filter(n >= 100)

wine_words_filtered %>%
  pairwise_cor(word, wine_id, sort = TRUE)
```

```{r}
library(Matrix)

wine_word_matrix <- wine_words_filtered %>%
  cast_sparse(wine_id, word)

wine_ids <- as.integer(rownames(wine_word_matrix))
scores <- wine_ratings$points[wine_ids]

library(glmnet)

wine_word_matrix_extra <- cbind(wine_word_matrix, log_price = log2(wine_ratings$price[wine_ids]))

library(doMC)
registerDoMC(cores = 4)

cv_glmnet_model <- cv.glmnet(wine_word_matrix_extra, scores, parallel = TRUE)

plot(cv_glmnet_model)
```

```{r}
lexicon <- cv_glmnet_model$glmnet.fit %>%
  tidy() %>%
  filter(lambda == cv_glmnet_model$lambda.1se,
         term != "(Intercept)",
         term != "log_price") %>%
  select(word = term, coefficient = estimate)

lexicon %>%
  arrange(coefficient) %>%
  group_by(direction = ifelse(coefficient < 0, "Negative", "Positive")) %>%
  top_n(16, abs(coefficient)) %>%
  ungroup() %>%
  mutate(word = fct_reorder(word, coefficient)) %>%
  ggplot(aes(word, coefficient, fill = direction)) +
  geom_col() +
  coord_flip() +
  labs(x = "",
       y = "Estimated effect of the word on the score",
       title = "What words are predictive of a wine's score?")
```

```{r}
wine_ratings %>%
  mutate(wine_id = row_number()) %>%
  arrange(points) %>%
  head(1) %>%
  select(wine_id, description) %>%
  pull(description)

wine_rating_words %>%
  filter(wine_id %in% sample(unique(wine_id), 6)) %>%
  distinct(word, title, points) %>%
  mutate(wine = paste0(str_trunc(title, 40), " (", points, ")")) %>%
  inner_join(lexicon, by = "word") %>%
  mutate(word = fct_reorder(word, coefficient)) %>%
  ggplot(aes(word, coefficient, fill = coefficient > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ wine, scales = "free_y") +
  labs(title = "How a lasso regression would predict each wine's score",
       subtitle = "Using a lasso regression with an extra term for price",
       x = "",
       y = "Effect on score")
```


### What is glmnet?

```{r}
cv_glmnet_model$glmnet.fit %>%
  tidy() %>%
  filter(term %in% c("rich", "black", "simple", "complex", "vineyard", "concentrated")) %>%
  ggplot(aes(lambda, estimate, color = term)) +
  geom_line() +
  scale_x_log10() +
  geom_hline(lty = 2, yintercept = 0)

cv_glmnet_model$glmnet.fit %>%
  tidy() %>%
  count(lambda) %>%
  ggplot(aes(lambda, n)) +
  geom_line() +
  scale_x_log10()
```


```{r}
wine_ratings %>%
  mutate(country = fct_relevel(fct_lump(country, 7), "US")) %>%
  mutate(country = fct_reorder(country, points)) %>%
  ggplot(aes(country, points)) +
  geom_boxplot() +
  coord_flip()

wine_ratings %>%
  group_by(year) %>%
  summarize(average_points = mean(points), n())
  
wine_ratings %>%
  mutate(reviewer = fct_reorder(fct_lump(taster_name, 10), points)) %>%
  ggplot(aes(reviewer, points)) +
  geom_boxplot() +
  coord_flip()
```