# ################### Case study: linear regression modeling ###################
# Our goal is to analyze which factor(s) contribute to an effective social media campaign (more clicks, likes, etc.).
# Which attribute(s) best represent the success of a social media campaign? (Total number of likes? Lifetime.Engaged.Users? Shares?)


# ##### Step 1: Data exploration ############
# 1.1 Load the data
# NOTE: the path is machine-specific; adjust it to where dataset_Facebook.csv lives.
fb_path <- file.path(
  "/Users/yuyanzhang/Desktop/RWorkshop",
  "Week 7 - Case Study (regression modeling)",
  "dataset_Facebook.csv"
)
# The file is semicolon-separated. header = TRUE tells R to read the first
# line of the file as the column names of the dataset.
fb <- read.table(fb_path, sep = ";", header = TRUE)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(fb)
}

# 1.2 Check the class of each attribute
# Prints one "<column>: <class>" line per column. A column with several
# classes is collapsed into a single "a/b" string so it still prints once.
for (col_name in names(fb)) {
  col_class <- paste(class(fb[[col_name]]), collapse = "/")
  print(paste(col_name, col_class, sep = ": "))
}

# Is there any attribute that needs recoding?
# These columns are stored as numbers but are categorical, so recode them
# as factors before plotting and modeling.
factor_cols <- c("Category", "Post.Month", "Post.Weekday", "Post.Hour", "Paid")
fb[factor_cols] <- lapply(fb[factor_cols], as.factor)

# 1.3 Check for missing data
which(is.na(fb))
# Use imputation methods to replace missing data, or simply remove the entries with missing values.
# It's your call! But remember that how you deal with missing values may influence the result of modeling.
fb <- fb[complete.cases(fb), ]
# Confirm that no missing values remain.
which(is.na(fb))

# 1.4 Look at the summary statistics of the dataset.
summary(fb)

# 1.5 Plot the correlation matrix to see which attributes are highly correlated
# num_var holds the positions of the numeric (integer or double) columns.
# is.numeric() is FALSE for factors, so the recoded columns are excluded.
num_var <- unname(which(vapply(fb, is.numeric, logical(1))))
# drop = FALSE keeps a data frame even if only one numeric column is found.
symnum(cor(fb[, num_var, drop = FALSE]))


# 1.6 How does Post.Month influence Page.total.likes?
# Hint: you can plot Page.total.likes against Post.Month
# Alternatively, you can plot the average against Post.Month (use ggplot!)
plot(fb$Post.Month, fb$Page.total.likes,
     xlab = "Post.Month", ylab = "Page.total.likes")
library(ggplot2)
# Bar height is the mean of Page.total.likes within each month.
# `fun` replaces the deprecated `fun.y` argument of stat_summary().
ggplot(fb, aes(x = factor(Post.Month), y = Page.total.likes)) +
  stat_summary(fun = "mean", geom = "bar")
# Does this trend make sense?
# How helpful is it?

# 1.7 How does Post.Month influence Lifetime.Engaged.Users?
plot(fb$Post.Month, fb$Lifetime.Engaged.Users,
     xlab = "Post.Month", ylab = "Lifetime.Engaged.Users")
# Bar height is the mean of Lifetime.Engaged.Users within each month.
# `fun` replaces the deprecated `fun.y` argument of stat_summary().
ggplot(fb, aes(x = factor(Post.Month), y = Lifetime.Engaged.Users)) +
  stat_summary(fun = "mean", geom = "bar")


# 1.8 How does Post.Weekday influence Page.total.likes?
plot(fb$Post.Weekday, fb$Page.total.likes,
     xlab = "Post.Weekday", ylab = "Page.total.likes")
# 1.9 How does Post.Weekday influence Lifetime.Engaged.Users?
plot(fb$Post.Weekday, fb$Lifetime.Engaged.Users,
     xlab = "Post.Weekday", ylab = "Lifetime.Engaged.Users")
# 1.10 Do the same analyses for Post.Hour
plot(fb$Post.Hour, fb$Page.total.likes,
     xlab = "Post.Hour", ylab = "Page.total.likes")
plot(fb$Post.Hour, fb$Lifetime.Engaged.Users,
     xlab = "Post.Hour", ylab = "Lifetime.Engaged.Users")

# 1.11 Based on your exploratory analysis, when do you recommend posting?


# ##### Step 2: Predict Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post ############
# 2.1 Create a linear model (lm1). Which attributes can we use?
# Type, Category, Post.Month, Post.Weekday, Post.Hour, Paid
# (because these are the only attributes known before the campaign is released)
lm1 <- lm(
  Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post ~
    Type + Category + Post.Hour + Post.Month + Post.Weekday + Paid,
  data = fb
)


# 2.2 Look at the summary statistics. Which attribute(s) are predictive at significance level 0.05?
summary(lm1)

# 2.3 How does the model perform? Which metric(s) indicate the robustness of the model?
AIC(lm1)
BIC(lm1)


# 2.4 Create diagnostic plots of the model. Is there any violation of assumptions?
# Show the diagnostic plots in a 2x2 grid, then restore whatever graphics
# layout was active before (par() returns the previous settings).
old_par <- par(mfrow = c(2, 2))
plot(lm1)
par(old_par)


# 2.5 Create a model (lm2) using only variables that are predictive at the 0.05 significance level
lm2 <- lm(
  Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post ~
    Type + Category + Post.Month,
  data = fb
)

# 2.6 Compare lm2 to lm1. Which one is better and how do you know?
# AIC side by side for both models (lower is better).
AIC(lm1, lm2)
# F-test of the nested models: reduced model first, full model second.
anova(lm2, lm1)

# 2.7 Create a stepwise model of lm1 (lm1.step).
lm1.step <- step(lm1)


# 2.8 Look at the diagnostic plots of lm1.step and lm2. Is there any significant problem?
# One 2x2 page of diagnostics per model; the previous layout is restored after.
old_par <- par(mfrow = c(2, 2))
plot(lm1.step)
plot(lm2)
par(old_par)


# Which model do you think is the best? lm1, lm2, or lm1.step?
# Use adjusted R-squared, AIC, BIC, etc. to evaluate the performance of the models


# ########## Step 3: Training set and testing set ###########
# Now, evaluate the model by generating predictions for unseen data
# 3.1 Set up the training and testing set (you can reserve 75% of the data as the training set)
set.seed(123)
n_obs <- nrow(fb)
# floor() makes the (otherwise fractional) sample size an explicit whole number.
train_idx <- sample(seq_len(n_obs), size = floor(0.75 * n_obs))
train_data <- fb[train_idx, ]
test_data <- fb[-train_idx, ]

# 3.2 Use the training set to fit lm1 (lm1.train)
lm1.train <- lm(
  Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post ~
    Type + Category + Post.Hour + Post.Month + Post.Weekday + Paid,
  data = train_data
)

# 3.3 Use lm1.train to generate predictions for the testing set and compute the MSE

# Mean squared error between predictions and observed values.
#   p:     numeric vector of predictions
#   r:     numeric vector of observed responses
#   na.rm: if TRUE, pairs whose squared error is NA are left out of the mean;
#          the default FALSE makes the result NA as soon as any pair is NA
# Returns a single numeric value.
mse <- function(p, r, na.rm = FALSE) {
  mean((p - r)^2, na.rm = na.rm)
}

# Subsetting keeps all factor levels, so levels(train_data$Post.Hour) still
# lists hours that occur only in the test rows. lm() drops unused levels when
# fitting, so compare against the levels the fitted model actually knows;
# otherwise predict() stops with a "factor has new levels" error.
seen_hours <- lm1.train$xlevels[["Post.Hour"]]
id <- which(!(test_data$Post.Hour %in% seen_hours))
test_data$Post.Hour[id] <- NA

lm1.pred <- predict(lm1.train, newdata = test_data)
lm1.actual <- test_data$Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post
# Rows whose Post.Hour was blanked out above get an NA prediction; score only
# the rows that could be predicted so the MSE is not NA.
lm1.scored <- !is.na(lm1.pred)
mse(lm1.pred[lm1.scored], lm1.actual[lm1.scored])

# 3.4 Fit the stepwise model of lm1.train (lm1.train.step)
lm1.train.step <- step(lm1.train)

# 3.5 Use lm1.train.step to generate predictions for the testing set and compute the MSE
lm1.step.pred <- predict(lm1.train.step, newdata = test_data)
lm1.step.actual <- test_data$Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post
# Predictions are NA for test rows with a missing predictor value; score only
# the rows that could be predicted so the MSE is not NA.
lm1.step.scored <- !is.na(lm1.step.pred)
mse(lm1.step.pred[lm1.step.scored], lm1.step.actual[lm1.step.scored])

# 3.6 Compare the two models. Which one do you recommend and why?

# ######### Step 4: Transformation of response variable #####################
# 4.1 Why transform the response variable "Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post"?


# 4.2 Use the Box-Cox method to determine the optimal lambda for the transformation
library(MASS)
# Evaluate the profile log-likelihood once and pick the lambda that maximises it.
bc <- boxcox(lm1, plotit = FALSE)
L <- bc$x[which.max(bc$y)]
L

# 4.3 Transform the response variable and fit a new model (lm1.trans)
# Power transform of the response. For lambda == 0 the Box-Cox transform is
# log(y); y^0 would turn the response into a constant.
box_cox_power <- function(y, lambda) {
  if (abs(lambda) < 1e-8) log(y) else y^lambda
}
lm1.trans <- lm(
  box_cox_power(Lifetime.People.who.have.liked.your.Page.and.engaged.with.your.post, L) ~
    Type + Category + Post.Hour + Post.Month + Post.Weekday + Paid,
  data = fb
)
summary(lm1.trans)

# 4.4 Look at the new diagnostics
# 2x2 grid of diagnostic plots; the previous graphics layout is restored after.
old_par <- par(mfrow = c(2, 2))
plot(lm1.trans, labels.id = NULL)
par(old_par)