Skip to content

Commit 6745985

Browse files
author
LProcopi15
committed
Week 8
1 parent cae26ed commit 6745985

File tree

3 files changed

+143
-1
lines changed

3 files changed

+143
-1
lines changed
63 KB
Binary file not shown.

Week 7 - Case Study (regression modeling)/lecture7_case study.R

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,17 @@
55

66
######Step 1: Data exploration############
77
#1.1 Load the data
8-
fb <- read.table("/Users/yuyanzhang/Desktop/RWorkshop/Week 7 - Case Study (regression modeling)/dataset_Facebook.csv", sep = ";", header = TRUE)
8+
fb <- read.table("C:/Users/Student/Documents/UVA 2016-2017/RWorkshop/Week 7 - Case Study (regression modeling)/dataset_Facebook.csv", sep = ";", header = TRUE)
99
#The header = TRUE parameter will tell R to read the first line in the file as the header of the dataset
1010
View(fb)
1111

1212
#1.2 Check the class of each attribute
13+
# Print the class of every column of a data frame, one "name: class" string
# per column.
#
# Args:
#   dataset: a data frame whose column classes should be reported.
# Returns:
#   NULL, invisibly; the function is called for its printed output.
check.class <- function(dataset) {
  # seq_len() gives an empty sequence for a zero-column input, whereas
  # 1:ncol(dataset) would iterate over c(1, 0) and fail on the first lookup.
  for (i in seq_len(ncol(dataset))) {
    # A column can carry several classes (e.g. c("POSIXct", "POSIXt"));
    # collapse them so each column yields exactly one string.
    col.class <- paste(class(dataset[[i]]), collapse = "/")
    print(paste0(colnames(dataset)[i], ": ", col.class))
  }
  invisible(NULL)
}
1317

18+
check.class(fb)
1419
#Is there any attribute that needs recoding?
1520

1621

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
bike <- read.csv("C:/Users/Student/Documents/UVA 2016-2017/RWorkshop/Week 1- Arithmatic and Data Type Intro/bike.csv")
2+
3+
###################
4+
#
5+
# Time Series
6+
#
7+
###################
8+
9+
bike.ts <- ts(bike$cnt)
10+
11+
# In time series look for 3 things:
12+
# 1. Trend - overall long-term upward or downward movement
13+
# 2. Seasonality - repeating patterns within a year
14+
# 3. Cycles - patterns that repeat over periods longer than a year
15+
plot(bike.ts)
16+
17+
# 1. Modeling for trend
18+
19+
# Create a new variable time.bike: the integer sequence 1..(length(bike.ts) - 7), i.e. every time index except the last week
20+
time.bike<-c(1:(length(bike.ts)-7))
21+
22+
# Build a new model, bike.trend which predicts bike.ts based on the time variable, time.bike- use all data except the last week of your bike time series
23+
bike.trend<-lm(bike.ts[time.bike]~time.bike)
24+
25+
# Use the summary() command on bike.trend
26+
summary(bike.trend)
27+
28+
# Is time significant in predicting the bike count (cnt)? Yes; p-value <= 0.001
29+
# Add trend line to time series plot
30+
plot(bike.ts)
31+
abline(bike.trend,col='red') # Overall upward trend
32+
33+
# 2. Modeling for seasonality
34+
35+
# Get the periodogram from bike.ts
36+
# Spell out TRUE: the shorthand T is an ordinary variable that can be reassigned.
pg.bike <- spec.pgram(bike.ts, spans = 9, demean = TRUE, log = "no")
37+
38+
# Find the peak, max.omega.bike
39+
# which.max() returns the index of the first largest spectral value, so the
# peak frequency is always a single number even if the maximum is tied.
max.omega.bike <- pg.bike$freq[which.max(pg.bike$spec)]
40+
41+
# What is the period?
42+
1/max.omega.bike # 23.97; there is a repeating cycle every ~24 days
43+
44+
# Conclusion: there is both seasonality and trend in this data set - need to address both of these in the model
45+
# Get the residuals for trend
46+
e.ts.bike <- ts(bike.trend$residuals)
47+
48+
49+
# Plot autocorrelation (acf) and partial autocorrelation (pacf)
50+
par(mfrow=c(1,2))
51+
acf(e.ts.bike, main="ACF of Residuals from e.ts.bike")
52+
pacf(e.ts.bike,main="PACF of Residuals from e.ts.bike")
53+
par(mfrow=c(1,1))
54+
55+
# Use the ACF and PACF to choose a model type
56+
# See table for details
57+
# ACF is sinusoidal and no clear pattern is visible in the PACF
58+
# Choose AR or ARMA
59+
60+
# Choose the order p for the AR model
61+
# ar(3) p=3
62+
bike.ar3 <- arima(e.ts.bike, order = c(3, 0, 0)) # AR(3): p = 3, d = 0, q = 0
63+
summary(bike.ar3)
64+
AIC(bike.ar3)
65+
66+
# Use the auto.arima from 'forecast' library- Automatically find p, q, & d terms
67+
library('forecast')
68+
bike.auto <- auto.arima(e.ts.bike, trace=TRUE)
69+
# Using autoregressive function the best model is ARIMA(2,1,1)
70+
AIC(bike.auto)
71+
72+
# Transform weekly info
73+
# Label each time index with its day of the week. The lookup table is ordered
# by remainder of the index modulo 7: 0 -> "Fri", 1 -> "Sat", ..., 6 -> "Thr".
day.labels <- c("Fri", "Sat", "Sun", "Mon", "Tue", "Wed", "Thr")
Day <- as.factor(day.labels[(time.bike %% 7) + 1])
83+
84+
# View default contrasts
85+
contrasts(Day)
86+
87+
# Build a model bike.season to model the day-of-week seasonality of bike.ts (add time.bike as a predictor to also capture the trend).
88+
bike.season<- lm(bike.ts[time.bike]~Day)
89+
summary(bike.season)
90+
91+
# Get the residuals from the bike.season model above and store in e.ts.bike2:
92+
e.ts.bike2 <- ts(bike.season$residuals)
93+
94+
# Plot acf and pacf side by side for easier examination
95+
par(mfrow=c(1,2))
96+
acf(e.ts.bike2, main="ACF of Residuals from bike.season")
97+
pacf(e.ts.bike2,main="PACF of Residuals from bike.season")
98+
par(mfrow=c(1,1))
99+
100+
# Sinusoidal decay on ACF and not on PACF - AR or ARMA
101+
bike.arma13 <- arima(e.ts.bike2, order=c(1,0,3))
102+
103+
# Use the auto.arima from 'forecast' library- Automatically find p, q, & d terms
104+
bike2.auto <- auto.arima(e.ts.bike2, trace=TRUE)
105+
# Best model is ARIMA(2,1,1)
106+
107+
###################
108+
#
109+
# Practice problems
110+
#
111+
###################
112+
113+
# 1. Create a time series dataset based on 'casual'
114+
115+
# 2. Create a model for the trend
116+
117+
# 3. Plot the ts data and the trend line
118+
119+
# 4. Determine if trend is significant
120+
121+
# 5. Obtain the residuals from the trend model
122+
123+
# 6. Plot the ACF and the PACF for this model and determine if AR, ARMA, or ARIMA should be used
124+
125+
# 7. Use auto.arima to determine the optimal model; also create a few other models with varying values for p, d and q
126+
127+
# 8. Compare models using AIC, BIC and adjusted R2
128+
129+
# 9. Create a periodogram for this data
130+
131+
# 10. Determine if seasonality is significant
132+
133+
# 11. Obtain the residuals, plot the ACF and PACF and interpret them
134+
135+
# 12. Create appropriate models for seasonality
136+
137+
# 13. Compare models using AIC, BIC and adjusted R2

0 commit comments

Comments
 (0)