R_Files/Poisson.R at master · svkerr/R_Files · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
## Poisson and Negative Binomial Count Models
# Text: Modeling Count Data

# Load libraries and appropriate data sets
library("COUNT")  # Text: Modeling Count Data
library(faraway)  # Text: Extending the Linear Model with R
library(dplyr)
data("smoking")
attach(smoking)    # so i won't have to use smoking$variable

# Let's do a brief linear regression
reg1 = lm(sbp ~ male + smoker + age, data=smoking)
summary(reg1)

# Predicted values may be obtained
mu = predict(reg1)    # mu is the expected value with the error (epsilon) contained. equivalent to "fitted.values" attribute of the regression (reg1)
mu

# Let's calculate the residuals manually and verify they equal the model residuals
sbp - mu

# Let's start the Poisson - with a basic plot with a mean = 2:
x = c(0,1,2,3,4,5)
u = 2
plot((exp(-u)* u**x)/factorial(x))

###### Poisson Regression: Constructing a True Poisson Model ################################
library(MASS)
set.seed(4590); nobs = 50000
x1 = runif(nobs); x2 = runif(nobs); x3 = runif(nobs) # the explanatory variables
py = rpois(nobs,exp(1 + 0.75*x1 - 1.25*x2 + 0.5*x3))  # Note: B0,B1,B2,B3
cnt = table(py)
dataf = data.frame(prop.table(table(py)))
dataf$cumulative = cumsum(dataf$Freq)
datafall = data.frame(cnt,dataf$Freq*100, dataf$cumulative*100)
datafall; summary(py)

# generate a poisson model using the above data (generated by a poisson distribution)
py1 = glm(py ~ x1 + x2 + x3, family=poisson)
summary(py1)
confint.default(py1)
py1$aic/(py1$df.null + 1)
pr = resid(py1,type="pearson")
pchi2 = sum(residuals(py1,type="pearson")^2) # Pearson's chi-squared goodness-of-fit statistic
disp = pchi2/py1$df.residual # Pearson Dispersion Statistic (should be = 1)
pchi2;disp

###### Poisson Regression: Modeling Real Data ####################################
library(msme)
library(COUNT)
data(rwm5yr)
rwm1984 = subset(rwm5yr,year==1984)
# Knowing the dataset is vital to the modeling process. so let's table(docvis)
docvis_cnt = table(rwm1984$docvis); docvis_cnt
dataf_docvis = data.frame(prop.table(table(rwm1984$docvis)))
dataf_docvis$cumulative = cumsum(dataf_docvis$Freq)
dataf_docvis_all = data.frame(docvis_cnt,dataf_docvis$Freq*100, dataf_docvis$cumulative * 100)
dataf_docvis_all
# notice there are lots of 0 counts
mean(rwm1984$docvis); var(rwm1984$docvis)
# Given a mean of 3.162881 a Poisson distr will be expected to have below # of zeroes:
exp(-3.162881)*3.162881^0/factorial(0)
# We expect 4.2% zeroes versus the observed value of 41.6%
# Thus the model is Poisson overdispersed. We also see that by comparing the population mean and variance (should be nearly equal)

# Two other explanatory variables that we will work with are 'outwork' and 'age'.
# There are 40 distinct ages:
length(unique(rwm1984$age))
# A best practice will center a continuous predictor when it starts far from 0, as in the case with age. We will do so.
# NOTE: centering changes only the intercept in the model, all other predictor coefficients and standard errors, and fit statistics, stay the same. We will interpret age differently when centered.
cage = rwm1984$age - mean(rwm1984$age)
poic = glm(docvis ~ outwork + age, family=poisson, data=rwm1984)  # The model
poic = glm(docvis ~ outwork + cage, family=poisson, data=rwm1984)  # The model
summary(poic)
pr = sum(residuals(poic,type="pearson")^2)     # Pearson Chi2
pr/poic$df.residual                            # Dispersion Statistic
# We see that the dispersion statistic of 11.34 exceeds the ideal value for a Poisson Model of 1.0

# In summary at this point:
# 1 There are excessive 0 counts in docvis given its mean (and assumed poisson pdf)
# 2 The variance of docvis far exceeds its mean, and
# 3 The dispersion stat of (11.34) is much greater than 1,
# It's not unwarranted to conclude that the model is truly overdispersed. The predictors all have p-values under 0.05 and thereby appear to significantly contributed to understanding (and predicting) docvis --- but they do not. The bias resulting from overdispersion means that p-values tell us nothing about the relationship of predictor to response. Remember this!!

#### Interpreting Poisson model coefficients
# The coefficient B_j in general is the change in the "log-count" of the response for a one-unit change in the predictor. Note for a binary predictor, this is from a change from 0 to 1. When a continuous predictor is centered, the reference is its mean value.

# Let's take another look at the data (without centering age)
rwm1984 = subset(rwm5yr, year == 1984)
poi1 = glm(docvis ~ outwork + age, family=poisson, data = rwm1984)
summary(poi1)
pr = sum(resid(poi1,type="pearson")^2);pr  # Pearson Chi2
pr/poi1$df.residual                         # Dispersion statistic
poi1$aic/(poi1$df.null + 1)                 # AIC/n
exp(coef(poi1))                             # IRR (Incidence Rate Ratios)
xbp <- predict(poi1)                        # xb, linear predictor
mup <- exp(xbp)                             # mu, fitted Poisson; equivalently predict(poi1,type="response") or fitted(poi1)
mean(mup)                                   # expected variance, mean equals variance

# Table of observed versus expected counts:
foo <- rbind(seq(from=0,to=17),obs=table(rwm1984$docvis)[1:18], exp = round(sapply(0:17, function(x)sum(dpois(x,fitted(poi1))))))


# Note: Per calculated above, the IRR indicates the ratio for the rate counts between two ascending contiguous levels of the response. We interpret the IRRs:
# 1 Out of work patients had 1.5 times more visits than patients who were working (age held constant)
# 2 Patients visited a physician about 2.2% more often with each year older in age (if age is centered for each year difference form the mean age, there is about a 2.2% decrease of increase in the mean number of visits, outwork held constant)
# 3 Probabilistically: out of work patients were 1.5 times more likely (or more probable) to visit a physician than working patients.
exp(coef(poi1)) * sqrt(diag(vcov(poi1)))    # delta method
# Note: The standard errors are the square root of the diagonal terms of the inverse negative of the Hessian. This can be found by also taking the square root of the diagonal terms of the variance-covariance matrix of the model coefficients. Per above.

coef(poi1)/sqrt(diag(vcov(poi1)))
# Note: A z-value is simply the ratio of the coefficient and associated standard error. Per above
exp(confint.default(poi1))                  # CI of IRR

## Rate Ratios and Probability
# In order to have a change in predictor value reflect a change in actual visits, we must exponentiate the coefficient.

## Modeling over Time, Area and Space
data(fasttrakg)
str(fasttrakg)
glimpse(fasttrakg)
# fastrtrakg data:
# die = count of number of deaths (response)
# anterior = did patient have a previous anterior infarction
# hcabg = did patient have a history of coronary bypass grafting
# killip = summary of cardiovascular health indicators (bigger number is bad)
fast = glm(die ~ anterior + hcabg + factor(killip), family="poisson", offset=log(cases),data=fasttrakg)
summary(fast)
exp(coef(fast))                            # IRR (Incident Rate Ratios)
exp(coef(fast)) * sqrt(diag(vcov(fast)))   # delta method
exp(confint.default(fast))                 # CI of IRR
modelfit(fast)
pr = sum(residuals(fast,type="pearson")^2)  # Pearson Chi2
pr/fast$df.residual                         # Dispersion statistic

# Interpretation of above model results:
# 1 Patients having an anterior site heart attrack are twice as likely to die than if damage was to other part of the heart
# 2 Patients with history of having a CABG procedure are twice as likely to die than if they did not have a procedure
# 3 Patients having a killip 2 status are 2.5 times as likely to die than if they have a killip level 1 status; having a killip 3 status are 3 times as likely to die and those at killip level 4 are 12 times more likely to die than those with no apparent heart problems
# 4 Those at first the dispersion statistic of 1.4 may appear low, the added 40% dispersion may prepresent a lack of model fit. Note: After graphing fitted.values(fast) to residuals(fast) there appears to be one outlier (the 2nd and 10th observations):
library(ggvis)
foo = data.frame(cbind(fitted.values(fast),residuals(fast)))
foo %>%
  ggvis(~X1, ~X2, fill:="red") %>%
  layer_points() %>%
  layer_model_predictions(model = "lm")

## Prediction
myglm = glm(docvis ~ outwork + age, family=poisson, data=rwm1984)
summary(myglm)
# Let's predict number of doc visits for a 50 year-old working patient (outwork = 0, so no need to include coef * 0). Using the model's coefficients:
working_pred = 50 * coef(myglm)[3] + coef(myglm)[1]  # remember - this gives you log(mu)
exp(working_pred)
# On basis of this model, we predict a working 50 year-old german will visit the doctor 3 times in 1984

# Predicted counts and their 95% confidence interval may be obtained from:
lpred = predict(myglm, newdata=rwm1984,type="link", se.fit=TRUE)
up = lpred$fit + (1.96*lpred$se.fit)
lo = lpred$fit - (1.96*lpred$se.fit)
eta = lpred$fit
mu = myglm$family$linkinv(eta)
upci = myglm$family$linkinv(up)
loci = myglm$family$linkinv(lo)
summary(loci)
summary(mu)
summary(upci)
layout(1); plot(eta, mu); lines(eta, loci, col=2, type='p');
lines(eta,upci, col=3, type='p')

##### Testing Overdispersion ###############################################
# Basics of Goodness-of-Fit
# The Deviance GOF test is based on the view that the deviance is distributed as Chi2. Deviance is in effect a measure of the distance between the most full (saturated) model we can fit, and the proposed model we are testing for fit.
# Using the German health data:
library(COUNT)
data(rwm5yr)
rwm1984 = subset(rwm5yr, year==1984)

mymod = glm(docvis ~ outwork + age, family=poisson, data=rwm1984)
mymod
dev = deviance(mymod)
df = df.residual(mymod)
p_value = 1 - pchisq(dev,df)
p_value
# If the resulting Chi2 p-value is less than 0.05, the model is considered well fit. Since the Chi2 p-value is less than 0.05, our model is a good fit
# This test statistic evaluates whether the value of the deviance, for a specific model size, is close enough to that of the saturated model that it cannot be rejected as fitting

# Calculate Dispersion Statistics and p-values
mymod = glm(docvis ~ outwork + age, family=poisson, data=rwm1984)
pr = sum(residuals(mymod, type="pearson")^2)    # Pearson Chi2
pr/mymod$df.residual                            # Dispersion statistic
pchisq(pr, mymod$df.residual, lower=F)          # Calculate p-value
pchisq(mymod$deviance, mymod$df.residual,lower=F) # Previously calculated above differently

# Function to calculate Pearson Chi2 and Dispersion Statistics
# Though we don't use the Pearson Chi2 statistic for a GOF test (due to bias), we do use it in defining overdispersion - Poisson overdispersion
# The Pearson Chi2 statistic is the squared residuals weighted (adjusted) by the model variance, and summed over all observations in the model
P__disp = function(x) {
  pr = sum(residuals(x, type="pearson")^2)
  dispersion = pr/x$df.residual
  pchisq(pr,x$df.residual, lower=F)
  cat("\n Pearson Chi2 = ", pr ,
      "\n Dispersion = ", dispersion, "\n")

}

# Overdispersion in a Poisson model occurs when the variance of the response is greater than its' mean. In general, is the occasion when the observed variance in a model is greater than its expected variance
# Primary causes: 1) positive correlation between responses 2) excess variation between response probabilities or counts 3) Violations in distribution assumptions
# Why is overdispersion a problem: May cause standard errors of the estimates to be underestimated (i.e., a variable may appear to be signficant predictor when it is not)
# What causes it: 1) model omits important explanatory variables 2) data includes outilers 3) model fails to included a needed interaction 4) predictor needs to be transformed 5) link function is misspecified
# To REITERATE: just because the Poisson model predictors are all significant does not in iteself indicated the model is well fitted. Check the dispersion statistic!

# Z-Score Test
# This test is based on two assumptions 1) data set is large 2) z is t-distributed
# The below example indicates the null hypothesis of no overdispersion is rejected (likely overdispersion is real)
library(COUNT)
data(rwm5yr)
rwm1984 = subset(rwm5yr, year==1984)
poi = glm(docvis ~ outwork + age, family=poisson, data=rwm1984)
mu = predict(poi, type="response")
z = ((rwm1984$docvis - mu)^2 - rwm1984$docvis)/(mu * sqrt(2))
zscore = lm(z ~ 1)
summary(zscore)

#### Methods of Handling Overdispersion ###########################
## Scaling Standard Errors: Quasi-Count Models
# Process is:
# 1 estimate model
# 2 calculate pearson dispersion stat
# 3 multiply SEs by sqrt(dispersion stat)
# 4 iterate model again

# Demonstrate the above manually and using R's quasipoisson option
data(medpar)
glimpse(medpar)
poi = glm(los ~ hmo + white + factor(type), family=poisson, data=medpar)
summary(poi)
confint(poi)
# profile confidence interval
pr = sum(residuals(poi, type="pearson")^2)      # Pearson stat
dispersion = pr/df.residual(poi) ; dispersion   # Dispersion stat
sse = sqrt(diag(vcov(poi))) * sqrt(dispersion)  # Model SE = SE * sqrt(disp)
# OR
poiQL = glm(los ~ hmo + white + factor(type), family=quasipoisson, data=medpar)
coef(poiQL); confint(poiQL)      # coeff and scaled SEs
# IRRs can be produced by exponentiating the model coefficients and appropriately adjusting associated statistics
exp(coef(poiQL))
# Interpretation of IRRs:
# 1 HMO members have 7% few days in hospital holding other predictors at their mean
# 2 White patients have 15% few days in hospital than nonwhites holding....
# 3 Urgent admissions stay 25% longer than elective admissions holding....
# 3 Emergency admissions stay twice as long as elective admissions holding...
# NOTE: model parameters remain unaffected when SEs are scaled - confidence intervals are...
modelfit(poi)

## Quasi-Liklihood Models
poiQL = glm(los ~ hmo + white + type2 + type3, family=poisson, data=medpar)
summary(poiQL)
pr = sum(residuals(poiQL,type="pearson")^2)
disp = pr/poiQL$df.residual ;disp
se = sqrt(diag(vcov(poiQL))) ; se
QLse = se/sqrt(disp) ; QLse
# ? How do i now recalculate the dispersion statistic?

## Sandwich or Robust Variance Estimators
# The foremost function is to adjust standard errors for data that are not independent
library(sandwich)
poi <- glm(los ~ hmo + white+factor(type), family=poisson, data=medpar)
vcovHC(poi)
# just for fun, compare the covaranceHC matrix with the standard covariance matrix
vcov(poi)
vcovHC(poi, type="const")
### TO be cont'd --- need to get on with Assessment of Fit

############ Assessment of Fit ##############################################
# How are residuals analyzed for count data (differnt than other models)?
# What are we looking for in graphing residuals for count models?
# 1 Evidence of poor fit
# 2 Nonrandom patterns
rwm <- glm(docvis ~ outwork + cage, family=poisson,data=rwm1984)
summary(rwm)
presid <- residuals(rwm, type="pearson")
respon <- residuals(rwm,type="response")
P__disp(rwm)
mu <- predict(rwm)
par(mfrow=c(1,2))
plot(x=mu,y=respon, main="Response residuals")
plot(x=mu, y=presid, main="Pearson residuals")

############# Standard Likelihood Ratio Test ####################################
# The likelihood ration drop1 test is useful in which one predictor at a time is dropped and the models are checked in turn for comparative goodness-of-fit.  In R, drop1 uses the deviance statistic rather than the log-likelihood as the basis for model comparison. But recall that the deviance is itself derived from the likelihood ratio.
# The likelihood ratio test rejects the null hypothesis if the value of this statistic (p-value) is too small. In the lrtest below, the null hypothesis is rejected (models are equivalent), and age is determined to be a signficant variable


library(COUNT); library(lmtest); data(rwm5yr)
rwm1984 <- subset(rwm5yr, year==1984)
poi1 <- glm(docvis ~ outwork + age, family=poisson, data=rwm1984)
poi1a <- glm(docvis ~ outwork, family=poisson, data=rwm1984)
lrtest(poi1,poi1a)
# or below where we see that p-value of 2.2e-6 indicates that outwork and age are signficant predictors in the model
drop1(poi1, test="Chisq")

# Note: the drop1 test is used by many analysts as a substitute for standard likelihood ratio test