|
1 | | -fire <- read.csv("C:/Users/Student/Documents/forestfires.csv") |
2 | | -View(fire) |
3 | | - |
4 | | -##################### |
5 | | -# |
6 | | -# Explore Data |
7 | | -# |
8 | | -#################### |
9 | | - |
10 | | -# Get summary information |
11 | | -summary(fire) |
12 | | -# No NA's, all numeric data is normalized |
13 | | - |
14 | | -# Check class of each attribute |
15 | | -for (i in 1:ncol(fire)){ |
16 | | - print(paste(colnames(fire[i]), ": ", class(fire[,i]),sep = "")) |
17 | | -} |
18 | | - |
19 | | -# Getting a subset of the data |
20 | | -summary(fire$area) |
21 | | -areabox <- boxplot(fire$area) |
22 | | -upperwhisk <- areabox$stats[5,] |
23 | | - |
24 | | -xtm_fire <- subset(fire, area >= upperwhisk, select = c('FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain', 'area')) |
25 | | - |
26 | | -#################### |
27 | | -# |
28 | | -# Basic modeling - Linear Regression |
29 | | -# |
30 | | -#################### |
31 | | - |
32 | | -# Plot two variables |
33 | | -plot(xtm_fire$temp, xtm_fire$area) |
34 | | -# Create linear regression model between the two |
35 | | -lm1 <- lm(area~temp, data = xtm_fire) |
36 | | -# Add regression line to plot |
37 | | -abline(lm1, col = "orange") |
38 | | -# Statistical information about this lm |
39 | | -summary(lm1) |
40 | | - |
41 | | -# Can add many more factors to this lm |
42 | | -lm2 <- lm(area~temp+FFMC+wind, data = xtm_fire) |
43 | | -summary(lm2) |
44 | | - |
45 | | -# Could interactions between variables help us? |
46 | | -pairs(xtm_fire[c('FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'area')]) |
47 | | -symnum(cor(xtm_fire[c('FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'area')])) |
48 | | -# Highly correlated: DC and DMC, ISI and FFMC, temp and FFMC |
49 | | -# Add these correlated attributes to a model |
50 | | -lm3 <- lm(area~temp+FFMC+wind+(DC+DMC)^2+(ISI+FFMC)^2+(temp+FFMC)^2, data = xtm_fire) |
51 | | -summary(lm3) |
52 | | - |
53 | | -# compare models |
54 | | -anova(lm1, lm2) |
55 | | -anova(lm1, lm3) |
56 | | -anova(lm2, lm3) |
57 | | - |
58 | | -##################### |
59 | | -# |
60 | | -# Practice Problems |
61 | | -# |
62 | | -#################### |
63 | | - |
64 | | -# 1. Create a new subset that includes only with an ISI (inital spread index) greater than the median |
65 | | - |
66 | | -# 2. Create a linear model with area and wind as your predictors, and area as your response |
67 | | -# Call this model lm1_ISI |
68 | | - |
69 | | -# 3. Determine if there are any correlated attributes |
70 | | - |
71 | | -# 4. If there are any correlated attributes add the interaction between them to a new model |
72 | | -# Call this model lm2_ISI |
73 | | - |
74 | | -# 5. Compare these two models and determine which model is better at predicting the size of the area burned |
| 1 | +fire <- read.csv("C:/Users/Student/Documents/forestfires.csv") |
| 2 | +View(fire) |
| 3 | + |
| 4 | +##################### |
| 5 | +# |
| 6 | +# Explore Data |
| 7 | +# |
| 8 | +#################### |
| 9 | + |
| 10 | +# Get summary information |
| 11 | +summary(fire) |
| 12 | +# No NA's, all numeric data is normalized |
| 13 | + |
| 14 | +# Check class of each attribute |
| 15 | +for (i in 1:ncol(fire)){ |
| 16 | + print(paste(colnames(fire[i]), ": ", class(fire[,i]),sep = "")) |
| 17 | +} |
| 18 | + |
| 19 | +# Getting a subset of the data |
| 20 | +summary(fire$area) |
| 21 | +areabox <- boxplot(fire$area) |
| 22 | +upperwhisk <- areabox$stats[5,] |
| 23 | + |
| 24 | +xtm_fire <- subset(fire, area >= upperwhisk, select = c('FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain', 'area')) |
| 25 | + |
| 26 | +#################### |
| 27 | +# |
| 28 | +# Basic modeling - Linear Regression |
| 29 | +# |
| 30 | +#################### |
| 31 | + |
| 32 | +# Plot two variables |
| 33 | +plot(xtm_fire$temp, xtm_fire$area) |
| 34 | +# Create linear regression model between the two |
| 35 | +lm1 <- lm(area~temp, data = xtm_fire) |
| 36 | +# Add regression line to plot |
| 37 | +abline(lm1, col = "orange") |
| 38 | +# Statistical information about this lm |
| 39 | +summary(lm1) |
| 40 | + |
| 41 | +# Can add many more factors to this lm |
| 42 | +lm2 <- lm(area~temp+FFMC+wind, data = xtm_fire) |
| 43 | +summary(lm2) |
| 44 | + |
| 45 | +# Could interactions between variables help us? |
| 46 | +pairs(xtm_fire[c('FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'area')]) |
| 47 | +symnum(cor(xtm_fire[c('FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'area')])) |
| 48 | +# Highly correlated: DC and DMC, ISI and FFMC, temp and FFMC |
| 49 | +# Add these correlated attributes to a model |
| 50 | +lm3 <- lm(area~temp+FFMC+wind+(DC+DMC)^2+(ISI+FFMC)^2+(temp+FFMC)^2, data = xtm_fire) |
| 51 | +summary(lm3) |
| 52 | + |
| 53 | +# compare models |
| 54 | +anova(lm1, lm2) |
| 55 | +anova(lm1, lm3) |
| 56 | +anova(lm2, lm3) |
| 57 | + |
| 58 | +##################### |
| 59 | +# |
| 60 | +# Practice Problems |
| 61 | +# |
| 62 | +#################### |
| 63 | + |
| 64 | +# 1. Create a new subset that includes only rows with an ISI (initial spread index) greater than the median |
| 65 | + |
| 66 | +# 2. Create a linear model with area and wind as your predictors, and area as your response |
| 67 | +# Call this model lm1_ISI |
| 68 | + |
| 69 | +# 3. Determine if there are any correlated attributes |
| 70 | + |
| 71 | +# 4. If there are any correlated attributes, add the interaction between them to a new model |
| 72 | +# Call this model lm2_ISI |
| 73 | + |
| 74 | +# 5. Compare these two models and determine which model is better at predicting the size of the area burned |
0 commit comments