---
title: "NYC Restaurants"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(tidyverse)
library(lubridate)
theme_set(theme_light())
# You can use this url to download the data directly into R (will take a few seconds)
restaurant_inspections_raw <- read_csv("https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv")
restaurant_inspections <- restaurant_inspections_raw %>%
janitor::clean_names() %>%
select(-phone, -grade_date, -record_date, -building, -street) %>%
mutate(inspection_date = mdy(inspection_date)) %>%
separate(inspection_type, c("inspection_program", "inspection_type"), sep = " / ")
```
```{r}
restaurant_inspections %>%
count(dba, camis, sort = TRUE)
restaurant_inspections %>%
count(year = year(inspection_date))
restaurant_inspections %>%
count(grade, sort = TRUE)
restaurant_inspections %>%
count(violation_code, violation_description, sort = TRUE)
restaurant_inspections %>%
filter(camis == 41297769, inspection_date == "2018-09-25") %>%
count(camis, dba, inspection_date, sort = TRUE)
restaurant_inspections %>%
count(cuisine_description, sort = TRUE)
restaurant_inspections %>%
filter(action == "No violations were recorded at the time of this inspection.") %>%
count(critical_flag)
inspections <- restaurant_inspections %>%
group_by(camis,
dba,
boro,
zipcode,
cuisine_description,
inspection_date,
action,
score,
grade,
inspection_type,
inspection_program) %>%
summarize(critical_violations = sum(critical_flag == "Critical", na.rm = TRUE),
non_critical_violations = sum(critical_flag == "Not Critical", na.rm = TRUE)) %>%
ungroup()
most_recent_cycle_inspection <- inspections %>%
filter(inspection_program == "Cycle Inspection",
inspection_type == "Initial Inspection") %>%
arrange(desc(inspection_date)) %>%
distinct(camis, .keep_all = TRUE)
```
```{r}
by_dba <- most_recent_cycle_inspection %>%
group_by(dba, cuisine = cuisine_description) %>%
summarize(locations = n(),
avg_score = mean(score),
median_score = median(score)) %>%
ungroup() %>%
arrange(desc(locations))
by_dba %>%
mutate(locations_bin = cut(locations, c(0, 1, 3, 10, Inf), labels = c("1", "2-3", "3-10", ">10"))) %>%
ggplot(aes(locations_bin, avg_score + 1)) +
geom_boxplot() +
scale_y_log10()
```
```{r}
by_cuisine <- by_dba %>%
group_by(cuisine) %>%
summarize(avg_score = mean(avg_score),
median_score = median(avg_score),
restaurants = n()) %>%
arrange(desc(restaurants))
library(broom)
cuisine_conf_ints <- by_dba %>%
add_count(cuisine) %>%
filter(n > 100) %>%
nest(-cuisine) %>%
mutate(model = map(data, ~ t.test(.$avg_score))) %>%
unnest(map(model, tidy))
cuisine_conf_ints %>%
mutate(cuisine = str_remove(cuisine, " \\(.*"),
cuisine = fct_reorder(cuisine, estimate)) %>%
ggplot(aes(estimate, cuisine)) +
geom_point() +
geom_errorbarh(aes(xmin = conf.low,
xmax = conf.high)) +
labs(x = "Average inspection score (higher means more violations)",
y = "Type of cuisine",
title = "Average inspection score by type of cuisine in NYC",
subtitle = "Each restaurant chain was counted once based on its average score")
```
```{r}
violation_cuisine_counts <- restaurant_inspections %>%
semi_join(most_recent_cycle_inspection, by = c("camis", "inspection_date")) %>%
count(critical_flag, violation_code, violation_description, cuisine = cuisine_description) %>%
group_by(violation_code) %>%
mutate(violation_total = sum(n)) %>%
group_by(cuisine) %>%
mutate(cuisine_total = sum(n)) %>%
ungroup() %>%
filter(violation_total >= 1000,
cuisine_total >= 2000) %>%
group_by(violation_description) %>%
mutate(fraction = n / cuisine_total,
avg_fraction = mean(fraction)) %>%
ungroup()
violation_cuisine_counts %>%
mutate(relative_risk = fraction / avg_fraction) %>%
arrange(desc(relative_risk)) %>%
filter(str_detect(violation_description, "mice"))
```
### What violations tend to occur together?
```{r}
library(widyr)
violations <- restaurant_inspections %>%
semi_join(most_recent_cycle_inspection, by = c("camis", "inspection_date")) %>%
filter(!is.na(violation_description))
violations %>%
pairwise_cor(violation_description, camis, sort = TRUE)
principal_components <- violations %>%
mutate(value = 1) %>%
widely_svd(violation_description, camis, value, nv = 6)
principal_components %>%
filter(dimension == 2) %>%
top_n(10, abs(value)) %>%
mutate(violation_description = str_sub(violation_description, 1, 60),
violation_description = fct_reorder(violation_description, value)) %>%
ggplot(aes(violation_description, value)) +
geom_col() +
coord_flip()
```