Skip to content

Commit 4e01ff4

Browse files
authored
Added categorical variables, training/testing sets
1 parent ec84645 commit 4e01ff4

File tree

1 file changed

+82
-20
lines changed

1 file changed

+82
-20
lines changed

Week 5 - Introduction to Modeling/Lecture5.R

Lines changed: 82 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
fire <- read.csv("C:/Users/Student/Documents/UVA 2016-2017/RWorkshop/Week 5 - Introduction to Modeling/forestfires.csv")
1+
fire <- read.csv("/Users/yuyanzhang/Desktop/RWorkshop/Week 5 - Introduction to Modeling/forestfires.csv")
22
View(fire)
33

44
#####################
@@ -11,7 +11,6 @@ View(fire)
1111
summary(fire)
1212
# No NA's, all numeric data is normalized
1313

14-
1514
# Check class of each attribute
1615
for (i in 1:ncol(fire)){
1716
print(paste(colnames(fire[i]), ": ", class(fire[,i]),sep = ""))
@@ -24,12 +23,6 @@ upperwhisk <- areabox$stats[5,]
2423

2524
xtm_fire <- subset(fire, area >= upperwhisk, select = c('FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain', 'area'))
2625

27-
# Transforming the data
28-
hist(fire$area) # highly skewed to the right (many small burned areas, a few very large ones)
29-
hist(log(fire$area))
30-
31-
fire$area_log <- log(fire$area + 0.1)
32-
3326
####################
3427
#
3528
# Basic modeling - Linear Regression
@@ -45,16 +38,7 @@ abline(lm1, col = "orange")
4538
# Statistical information about this lm
4639
summary(lm1)
4740

48-
# Plot two variables
49-
plot(fire$temp, fire$area_log)
50-
# Create linear regression model between the two
51-
lm1_log <- lm(area_log~temp, data = fire)
52-
# Add regression line to plot
53-
abline(lm1_log, col = "orange")
54-
# Statistical information about this lm
55-
summary(lm1_log)
56-
57-
# Can add many more factors to any lm
41+
# Can add many more factors to this lm
5842
lm2 <- lm(area~temp+FFMC+wind, data = xtm_fire)
5943
summary(lm2)
6044

@@ -67,12 +51,12 @@ lm3 <- lm(area~temp+FFMC+wind+(DC+DMC)^2+(ISI+FFMC)^2+(temp+FFMC)^2, data = xtm_
6751
summary(lm3)
6852

6953
# compare models
70-
anova(lm1, lm1_log)
71-
anova(lm1, lm2)
54+
anova(lm1, lm2)
7255
# Large p-value means that the additional factors do not contribute to predicting the value of the response
7356
anova(lm1, lm3)
7457
anova(lm2, lm3)
7558

59+
7660
#####################
7761
#
7862
# Practice Problems
@@ -94,3 +78,81 @@ anova(lm2, lm3)
9478
# Call this model lm2_ISI
9579

9680
# 7. Compare these two models and determine which model is better at predicting the size of the area burned
81+
82+
83+
#####################
84+
#
85+
# Categorical variables
86+
#
87+
###################
88+
89+
# Suppose we want to develop a model to predict the area, also using the month and day attributes
90+
# Inspect the class of each categorical column and the dummy coding R will use for it.
# NOTE(review): contrasts() only works on factors. Whether fire$month/fire$day are
# factors depends on read.csv(): before R 4.0 strings became factors by default;
# from R 4.0 on they stay character and would need as.factor() first — confirm
# the R version this workshop targets.
class(fire$month)
91+
contrasts(fire$month)
92+
class(fire$day)
93+
contrasts(fire$day)
94+
95+
#Get the right dataset
96+
xtm_fire_withCategorical <- subset(fire, area >= upperwhisk, select = c('FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain', 'area','month','day'))
97+
lm4 <- lm(area~temp+FFMC+wind+(DC+DMC)^2+(ISI+FFMC)^2+(temp+FFMC)^2+month+day, data = xtm_fire_withCategorical)
98+
99+
#Practice problem: is lm4 better than lm3? How do you know?
100+
101+
102+
103+
#####################
104+
#
105+
# Evaluating the performance of model: training set and testing set
106+
#
107+
###################
108+
109+
#Why? Because we want to test the robustness of the model on unseen data
110+
111+
#Suppose we want to develop a model to predict the temp. Let's use the entire dataset (not just extreme cases)
112+
113+
# Set up the training set and testing set (75/25 random split)
114+
set.seed(123456) # Set the seed of the random process so the split is reproducible
115+
smp_size <- floor(0.75 * nrow(fire)) # We are separating 75% of the data as the training data, and 25% as the testing data
116+
# Draw smp_size row indices at random, without replacement
train_ind <- sample(seq_len(nrow(fire)), size = smp_size)
117+
train <- fire[train_ind, ]  # sampled rows -> training set
118+
test <- fire[-train_ind, ]  # all remaining rows -> testing set
119+
120+
# Train the model on the training set.
# Fix: FFMC appeared twice in the original formula ("... + (ISI+FFMC)^2 + FFMC + ...").
# lm()/terms() silently drop duplicated terms, so removing the redundant FFMC leaves
# the fitted model unchanged while making the formula readable.
# Note: (ISI + FFMC)^2 expands to ISI + FFMC + ISI:FFMC, so the FFMC main effect
# is still included.
lm.train <- lm(temp ~ area + FFMC + wind + (DC + DMC)^2 + (ISI + FFMC)^2 + month + day,
               data = train)
summary(lm.train)
123+
124+
#Use the trained model to generate predictions for the testing sets
125+
predict(lm.train,test) #The command gives a vector of predicted values, one for each data point in the testing set
126+
pred <- as.vector(predict(lm.train,test))
127+
128+
#Evaluate the performance of the model based on the predictions: calculating the MSE (mean squared error) of predictions
129+
# Mean squared error between predictions `p` and observed values `r`.
# Returns mean((p - r)^2); NA values in either argument propagate to the result.
mse <- function(p, r) {
  squared_err <- (p - r)^2
  mean(squared_err)
}
134+
135+
pmse.lm.train<-mse(pred,test$temp)
136+
pmse.lm.train
137+
138+
#Practice question: how does pmse.lm.train compare to the MSE on the training set? Which value better represents the robustness of the model?
139+
#Hint: use mean((lm.train$residuals)^2) to compute the MSE on training set
140+
141+
#####################
142+
#
143+
# Practice Problems
144+
#
145+
###################
146+
#1. Develop a linear model to predict the rain, using all attributes (hint: you can write the formula this way: lm(rain~., data = fire) to indicate you are using all attributes)
147+
148+
#2. Look at the summary of the model. Which attributes are statistically significant? (with p-value < 0.05)
149+
150+
#3. Develop a linear model only using attributes that are statistically significant
151+
152+
#4. Compare model 1 and model 3. Are they statistically different? Which one would you choose?
153+
154+
#5. Split the dataset into training and testing sets. Then use the training set to train the model you selected.
155+
156+
#6. Generate predictions for the testing sets.
157+
158+
#7. Compute the mean squared error of the predictions

0 commit comments

Comments
 (0)