Skip to content

Commit cae26ed

Browse files
authored
Add files via upload
1 parent 1253474 commit cae26ed

File tree

1 file changed

+157
-0
lines changed

1 file changed

+157
-0
lines changed
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
####################This is the case study for the linear regression modeling###############
2+
#Our goal is to analyze which factor(s) contributes to effective social media campaign (more clicks, likes etc.)
3+
#Which attribute(s) best represents the success of social media campaign? (total number of likes? lifetime.engaged users? share?)
4+
5+
6+
######Step 1: Data exploration############
7+
#1.1 Load the data
8+
# Read the semicolon-separated Facebook post dataset into the data frame `fb`.
# header = TRUE tells R to treat the first line of the file as column names.
fb <- read.table(
  "/Users/yuyanzhang/Desktop/RWorkshop/Week 7 - Case Study (regression modeling)/dataset_Facebook.csv",
  sep = ";",
  header = TRUE
)
# View() opens a spreadsheet-style data viewer, which only makes sense when a
# person is driving the session; skip it in batch runs (Rscript, R CMD BATCH).
if (interactive()) {
  View(fb)
}
11+
12+
#1.2 Check the class of each attribute
13+
# Print one "<column name>: <class>" line per column of fb.
# Iterating over names(fb) is safe even if fb has no columns (1:ncol(fb) would
# count 1, 0), and collapsing the class vector keeps a single line for columns
# that carry more than one class.
for (col_name in names(fb)) {
  col_class <- paste(class(fb[[col_name]]), collapse = "/")
  print(paste(col_name, col_class, sep = ": "))
}
16+
17+
#Is there any attribute that needs recoding?
18+
# These columns are read in as numbers but are really category codes, so
# convert each of them to a factor in one pass.
factor_cols <- c("Category", "Post.Month", "Post.Weekday", "Post.Hour", "Paid")
fb[factor_cols] <- lapply(fb[factor_cols], as.factor)
23+
24+
#1.3 Check for missing data
25+
# Positions of the missing cells in fb (before cleaning).
which(is.na(fb))

# Missing values can either be imputed or the affected rows can be removed.
# It's your call, but remember that the choice may influence the modeling
# result. Here we keep only the rows that have no missing value at all.
keep_rows <- complete.cases(fb)
fb <- fb[keep_rows, ]

# Same check again after the rows with missing values were dropped.
which(is.na(fb))

#1.4 Look at the summary statistics of the dataset.
summary(fb)
33+
34+
#1.5 Plot the correlation matrix to see which attributes are highly correlated
35+
# Column positions of the numeric (integer or double) attributes of fb.
# is.numeric() replaces the class(x) == "integer" || class(x) == "numeric"
# comparison, which errors on current R versions as soon as a column has more
# than one class, and vapply() avoids growing the vector inside a loop.
num_var <- unname(which(vapply(fb, is.numeric, logical(1))))

# Symbolic view of the correlation matrix of the numeric attributes.
# drop = FALSE keeps a data frame even if only one numeric column exists.
symnum(cor(fb[, num_var, drop = FALSE]))
42+
43+
44+
#1.6 How does Post.Month influence Page.total.likes?
45+
#Hint: you can plot Page.total.likes against Post.Month
46+
#Alternatively, you can plot the average against Post.Month (use ggplot!)
47+
# Post.Month is a factor, so plot() draws one box of Page.total.likes per month.
plot(fb$Post.Month, fb$Page.total.likes)

library(ggplot2)
# Bar chart of the mean Page.total.likes per month.
# stat_summary()'s `fun.y` argument is deprecated (ggplot2 >= 3.3.0); the
# summary function is now passed through `fun`.
ggplot(fb, aes(x = factor(Post.Month), y = Page.total.likes)) +
  stat_summary(fun = mean, geom = "bar")
50+
#Does this trend make sense?
51+
#How helpful is it?
52+
53+
#1.7 How does Post.Month influence Lifetime.Engaged.Users?
54+
# Post.Month is a factor, so plot() draws one box of Lifetime.Engaged.Users
# per month.
plot(fb$Post.Month, fb$Lifetime.Engaged.Users)

# Bar chart of the mean Lifetime.Engaged.Users per month.
# stat_summary()'s `fun.y` argument is deprecated (ggplot2 >= 3.3.0); the
# summary function is now passed through `fun`.
ggplot(fb, aes(x = factor(Post.Month), y = Lifetime.Engaged.Users)) +
  stat_summary(fun = mean, geom = "bar")
56+
57+
58+
#1.8 How does Post.Weekday influence Page.total.likes?
59+
# Page.total.likes by day of the week.
with(fb, plot(Post.Weekday, Page.total.likes))

#1.9 How does Post.Weekday influence Lifetime.Engaged.Users?
with(fb, plot(Post.Weekday, Lifetime.Engaged.Users))

#1.10 Do the same analyses for Post.Hour
with(fb, plot(Post.Hour, Page.total.likes))
with(fb, plot(Post.Hour, Lifetime.Engaged.Users))
65+
66+
#1.11 Based on your exploratory analysis, when do you recommend to post?
67+
68+
69+
######Step 2: Predict Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post############
70+
#2.1 Create a linear model (lm1). Which attributes can we use?
71+
#Type,category,Post.Month,Post.Weekday,Post.Hour,Paid (Because we only have information for these attributes prior to releasing the campaign)
72+
# Full model: regress the response on every attribute that is known before the
# post is published.
lm1 <- lm(
  Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post ~
    Type + Category + Post.Hour + Post.Month + Post.Weekday + Paid,
  data = fb
)

#2.2 Look at the summary statistics. Which attribute(s) are significant at the
#    0.05 level?
summary(lm1)

#2.3 How does the model perform? Which metric(s) indicate the quality of the
#    fit?
AIC(lm1)
BIC(lm1)

#2.4 Create diagnostic plots of the model. Is any assumption violated?
# Arrange the diagnostic plots of lm1 on a 2 x 2 grid, then go back to a
# single-plot layout.
par(mfrow = c(2, 2))
plot(lm1)
par(mfrow = c(1, 1))
87+
88+
89+
#2.5 Create a model (lm2) using only the variables that are significant at the 0.05 level
90+
# Reduced model: keeps only Type, Category and Post.Month as predictors.
lm2 <- lm(
  Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post ~
    Type + Category + Post.Month,
  data = fb
)

#2.6 Compare lm2 to lm1. Which one is better and how do you know?
AIC(lm2)
# lm2 is nested in lm1, so anova() gives an F-test of the dropped terms.
anova(lm1, lm2)

#2.7 Create a stepwise model of lm1 (lm1.step).
lm1.step <- step(lm1)

#2.8 Look at the diagnostic plots of lm1.step and lm2. Is there any significant
#    problem?
# Both models named in the question are plotted, each on a 2 x 2 grid (the same
# layout used for lm1), and the single-plot layout is restored afterwards.
par(mfrow = c(2, 2))
plot(lm1.step)
plot(lm2)
par(mfrow = c(1, 1))
102+
103+
104+
#Which model do you think is the best? lm1, lm2, or lm1.step?
105+
#Use adjusted R-squared, AIC, BIC, etc. to evaluate the performance of the models
106+
107+
108+
###########Step3: Training set and testing set###########
109+
#Now, evaluate the model through generating predictions for unseen data
110+
#3.1 Setup the training and testing set (you can reserve 75% of the data as the training set)
111+
# Fix the random seed so the split below is reproducible.
set.seed(123)

# Draw 75% of the row positions (rounded down) for the training set.
# seq_len() replaces the redundant seq(1:nrow(fb)), and floor() makes the
# truncation of the fractional sample size explicit.
train_idx <- sample(seq_len(nrow(fb)), size = floor(0.75 * nrow(fb)))
train_data <- fb[train_idx, ]
test_data <- fb[-train_idx, ]

#3.2 Use the training set to fit lm1 (lm1.train)
lm1.train <- lm(
  Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post ~
    Type + Category + Post.Hour + Post.Month + Post.Weekday + Paid,
  data = train_data
)
118+
119+
#3.3 Use lm1.train to generate predictions for the testing set and compute the mse
120+
121+
# Mean squared error between predictions and observed values.
#
# p:     numeric vector of predictions.
# r:     numeric vector of observed (reference) values.
# na.rm: if TRUE, pairs whose squared error is NA (for example rows the model
#        could not predict) are left out of the average. The default FALSE
#        keeps the previous behaviour: any NA makes the result NA.
#
# Returns a single number, the mean of (p - r)^2.
mse <- function(p, r, na.rm = FALSE) {
  mean((p - r)^2, na.rm = na.rm)
}
125+
126+
# A factor keeps its complete level set when rows are subset, so
# levels(train_data$Post.Hour) also lists hours that occur only in the test
# rows and can never flag an unseen value. lm() drops unused levels while
# fitting; the levels the model really knows are stored in lm1.train$xlevels.
# For every categorical predictor, blank out (NA) the test values the model has
# not seen. Otherwise predict() stops with "factor ... has new levels".
for (var in names(lm1.train$xlevels)) {
  unseen <- !(test_data[[var]] %in% lm1.train$xlevels[[var]])
  test_data[[var]][unseen] <- NA
}

observed <- test_data$Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post

# Rows with a blanked-out predictor get an NA prediction. Score only the rows
# the full model can predict so the MSE is a number rather than NA.
lm1.pred <- predict(lm1.train, newdata = test_data)
scored <- !is.na(lm1.pred)
mse(lm1.pred[scored], observed[scored])

#3.4 Fit the stepwise model of lm1.train (lm1.train.step)
lm1.train.step <- step(lm1.train)

#3.5 Use lm1.train.step to generate predictions for the testing set and compute the mse
# The stepwise model uses a subset of lm1.train's predictors, so it can predict
# every row the full model could. Scoring the same rows keeps the two MSE
# values comparable.
lm1.step.pred <- predict(lm1.train.step, newdata = test_data)
mse(lm1.step.pred[scored], observed[scored])
138+
139+
#3.6 Compare two models. Which one do you recommend and why?
140+
141+
##########Step 4: Transformation of response variable#####################
142+
#4.1 Why transforming the response variable: "Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post"?
143+
144+
145+
#4.2 Use the Box-Cox method to determine the optimal lambda for the transformation
146+
library(MASS)

# Evaluate the Box-Cox profile log-likelihood of lm1 once (no plot) and take
# the lambda at which it is largest.
bc_profile <- boxcox(lm1, plotit = FALSE)
L <- bc_profile$x[which.max(bc_profile$y)]
L

#4.3 Transform the response variable and fit a new model (lm1.trans)
# Power transform of the response. At lambda = 0 the Box-Cox family uses
# log(y); y^0 would turn the response into the constant 1.
boxcox_response <- function(y, lambda) {
  if (abs(lambda) < 1e-8) {
    log(y)
  } else {
    y^lambda
  }
}

lm1.trans <- lm(
  boxcox_response(
    Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post, L
  ) ~ Type + Category + Post.Hour + Post.Month + Post.Weekday + Paid,
  data = fb
)
summary(lm1.trans)

#4.4 Look at new diagnostics
# 2 x 2 grid of diagnostic plots without point labels, then back to a
# single-plot layout.
par(mfrow = c(2, 2))
plot(lm1.trans, labels.id = NULL)
par(mfrow = c(1, 1))

0 commit comments

Comments
 (0)