Assignment 2: Case-control analysis via logistic regression (due: 12.00 26.03.2023)

Author

Janko Pavlech

Introduction

Ille-et-Vilaine Data contains the results of a case-control study investigating the effect of alcohol and tobacco consumption on the risk of oesophageal cancer. There are 200 male cases and 775 male controls, all of them inhabitants of the French departement Ille-et-Vilaine. (See [BD1], Sec. 4.1, p. 122–124, reference in course materials).

rm(list=ls())
library(knitr,quietly = TRUE)
library(dplyr,quietly = TRUE)
load("iev.RData")

Question 1

1. Reproduce the descriptive results in Table 4.1 of [BD1], p. 123.

The solution is straightforward.

age<-c("25-34","35-44","45-54","55-64","65-74",">=75")
As<-rep(0,length(age))
for (i in 1:length(age)){
 As[i]<-sum(iev$case[iev$agegr==age[i]])
}
Acs<-rep(0,length(age))
for (i in 1:length(age)){
  Acs[i]<-sum(iev$case[iev$agegr==age[i]]==0)
}


Am<-mean(iev$agediag[iev$case==1])
Asd<-sd(iev$agediag[iev$case==1])

Acm<-mean(iev$agediag[iev$case==0])
Acsd<-sd(iev$agediag[iev$case==0])

C1<-c(As,Am,Asd)
C2<-c(Acs,Acm,Acsd)
TAB<-data.frame(C1,C2)

TAB <- rbind(formatC(as.matrix(TAB[1:6,]),format="d"), 
             formatC(as.matrix(TAB[7:8,]),format="f",digits=1))

colnames(TAB)=c("Cases","Control")
rownames(TAB)=c("25-34","35-44","45-54","55-64","65-74","75+","Mean","S.D")
knitr::kable(TAB,caption = "Distribution of risk factors for cases and controls: Ille-et-Vilaine study of oesophageal cancer (Age [years]).")
Distribution of risk factors for cases and controls: Ille-et-Vilaine study of oesophageal cancer (Age [years]).
Cases Control
25-34 1 115
35-44 9 190
45-54 46 167
55-64 76 166
65-74 55 106
75+ 13 31
Mean 60.0 50.2
S.D 9.2 14.2

Average age of patients with cancer in this data set is \(60.0\) years which is higher than for patients without cancer (\(50.2\) years). The number of patients with cancer is highest for age group \(55-64\) years.

Als<-rep(0,4)
Als[1]<-sum(iev$case[iev$alctot<=39])
Als[2]<-sum(iev$case[iev$alctot>39 & iev$alctot<=79])
Als[3]<-sum(iev$case[iev$alctot>79 & iev$alctot<=119])
Als[4]<-sum(iev$case[iev$alctot>119])

Alcs<-rep(0,4)
Alcs[1]<-sum(iev$case[iev$alctot<=39]==0)
Alcs[2]<-sum(iev$case[iev$alctot>39 & iev$alctot<=79]==0)
Alcs[3]<-sum(iev$case[iev$alctot>79 & iev$alctot<=119]==0)
Alcs[4]<-sum(iev$case[iev$alctot>119]==0)


Alm<-mean(iev$alctot[iev$case==1])
Alsd<-sd(iev$alctot[iev$case==1])

Alcm<-mean(iev$alctot[iev$case==0])
Alcsd<-sd(iev$alctot[iev$case==0])

C1<-c(Als,Alm,Alsd)
C2<-c(Alcs,Alcm,Alcsd)
TAB<-data.frame(C1,C2)

TAB <- rbind(formatC(as.matrix(TAB[1:4,]),format="d"),    
             formatC(as.matrix(TAB[5:6,]),format="f",digits=1))

colnames(TAB)=c("Cases","Control")
rownames(TAB)=c("0-39","40-79","80-119","120+","Mean","S.D")
knitr::kable(TAB,caption = "Distribution of risk factors for cases and controls: Ille-et-Vilaine study of oesophageal cancer (Alcohol [g/day]).")
Distribution of risk factors for cases and controls: Ille-et-Vilaine study of oesophageal cancer (Alcohol [g/day]).
Cases Control
0-39 29 385
40-79 75 280
80-119 51 88
120+ 45 22
Mean 85.1 44.4
S.D 48.5 31.9

In this data set, the average consumption of alcohol is almost two times higher for patients with cancer (\(85.1\) g/day) than for patients without cancer (\(44.4\) g/day).

Ts<-rep(0,4)
Ts[1]<-sum(iev$case[iev$tob<=9])
Ts[2]<-sum(iev$case[iev$tob>9 & iev$tob<=19])
Ts[3]<-sum(iev$case[iev$tob>19 & iev$tob<=29])
Ts[4]<-sum(iev$case[iev$tob>29])

Tcs<-rep(0,4)
Tcs[1]<-sum(iev$case[iev$tob<=9]==0)
Tcs[2]<-sum(iev$case[iev$tob>9 & iev$tob<=19]==0)
Tcs[3]<-sum(iev$case[iev$tob>19 & iev$tob<=29]==0)
Tcs[4]<-sum(iev$case[iev$tob>29]==0)


Tm<-mean(iev$tob[iev$case==1])
Tsd<-sd(iev$tob[iev$case==1])

Tcm<-mean(iev$tob[iev$case==0])
Tcsd<-sd(iev$tob[iev$case==0])




C1<-c(Ts,Tm,Tsd)
C2<-c(Tcs,Tcm,Tcsd)
TAB<-data.frame(C1,C2)

TAB <- rbind(formatC(as.matrix(TAB[1:4,]),format="d"),
             formatC(as.matrix(TAB[5:6,]),format="f",digits=1))

colnames(TAB)=c("Cases","Control")
rownames(TAB)=c("0-9","10-19","20-29","30+","Mean","S.D")
knitr::kable(TAB,caption = "Distribution of risk factors for cases and controls: Ille-et-Vilaine study of oesophageal cancer (tobacco [g/day]).")
Distribution of risk factors for cases and controls: Ille-et-Vilaine study of oesophageal cancer (tobacco [g/day]).
Cases Control
0-9 78 448
10-19 58 178
20-29 33 98
30+ 31 51
Mean 16.7 10.5
S.D 12.9 11.9

Also the tobacco consumption is on average higher for patients with cancer (\(16.7\) g/day) than fo patients without cancer (\(10.5\) g/day).

Question 2

2. Conduct a grouped analysis of alcohol risk adjusted for age (see [BD1], p. 210–213).

We split the data into six groups by age. With exposure to alcohol (more than 79 g/day).

DATA<-iev
DATA$strata<-as.factor(as.numeric(DATA$agegr))
DATA$exp<-ifelse(DATA$alctot>79,1,0)


data<-DATA[,c(1,12,13)]

knitr::kable(TAB<-table(data$case,data$exp,data$strata),col.names = c("Case","Exposure","Age stratum","Number of patients"),caption = "Reorganized data.",align = "c")
Reorganized data.
Case Exposure Age stratum Number of patients
0 0 1 106
1 0 1 0
0 1 1 9
1 1 1 1
0 0 2 164
1 0 2 5
0 1 2 26
1 1 2 4
0 0 3 138
1 0 3 21
0 1 3 29
1 1 3 25
0 0 4 138
1 0 4 34
0 1 4 28
1 1 4 42
0 0 5 88
1 0 5 36
0 1 5 18
1 1 5 19
0 0 6 31
1 0 6 8
0 1 6 0
1 1 6 5

We will fit three models and choose the best one. The first model is intercept only \[ \text{logit}P_i=\alpha_i. \] The second model consider effect of alcohol \[ \text{logit}P_i=\alpha_i+\beta\cdot Alcohol. \] And the third model with interaction of alcohol and age shifted by average stratum. \[ \text{logit}P_i=\alpha_i+\beta\cdot Alcohol+\gamma\cdot Alcohol\cdot (i-3.5). \]

Estimates of these three models can be seen in the table below. We also calculate goodness of fit statistics (GOF). We have to note that as we have zeroes in the Number of patients in the previous table, we removed logarithms of zero in sum of Log-likelihood .

#functions for goodnes of fit statistics
Goodv<-function(TAB=TAB,ai,bi,ci,di){
  SUM=0
  for (i in 1:length(ai)){
    SUM=SUM+(TAB[2,2,i]-ai[i])^2*(1/ai[i]+1/bi[i]+1/ci[i]+1/di[i])
  }
  SUM
}

Good<-function(TAB=TAB,ai,bi,ci,di){
  #for log(0):=0 (in sum) 
  logg<-function(x){
    ifelse(x==0,0,log(x))
  }
  SUM=0
  for (i in 1:length(ai)){
    SUM=SUM+logg(TAB[2,2,i]/ai[i])*(TAB[2,2,i])+
           logg(TAB[2,1,i]/bi[i])*(TAB[2,1,i])+
           logg(TAB[1,2,i]/ci[i])*(TAB[1,2,i])+
           logg(TAB[1,1,i]/di[i])*(TAB[1,1,i])
  }
  2*SUM
}
pval<-function(chi,df,dig=3){
  p=1-pchisq(chi,df=df)
  ifelse(p<0.001,"<0.001",formatC(p,format="f",digits=dig))
}
#-------------------------------------
#total numbers Exposure/Unexposed
m1i=rep(0,6)
m0i=rep(0,6)
for (i in 1:6){
  m0i[i]=sum(TAB[,1,i])
  m1i[i]=sum(TAB[,2,i])
}
#-------------------------------------
#First model

fit1 <- glm(case ~ strata, family = binomial, data = data)
#summary(fit1)  

alpha1<-unname(c(coef(fit1)[1],coef(fit1)[1]+coef(fit1)[2:6]))

P1i_1<-(exp(alpha1))/(1+exp(alpha1))

P0i_1<-(exp(alpha1))/(1+exp(alpha1))

ai1=m1i*P1i_1

bi1=m0i*P0i_1

ci1=m1i-ai1

di1=m0i-bi1

G1<-Good(TAB,ai1,bi1,ci1,di1)
Gv1<-Goodv(TAB,ai1,bi1,ci1,di1)

#-------------------------------------
#Second model

fit2 <- glm(case ~ strata+exp, family = binomial, data = data)
#summary(fit2)  


alpha2<-unname(c(coef(fit2)[1],coef(fit2)[1]+coef(fit2)[2:6]))
beta2<-unname(coef(fit2)[7])
beta2_sd<-summary(fit2)$coefficients[7,2]

P1i_2<-(exp(alpha2+beta2))/(1+exp(alpha2+beta2))

P0i_2<-(exp(alpha2))/(1+exp(alpha2))

ai2=m1i*P1i_2

bi2=m0i*P0i_2

ci2=m1i-ai2

di2=m0i-bi2

G2<-Good(TAB,ai2,bi2,ci2,di2)
Gv2<-Goodv(TAB,ai2,bi2,ci2,di2)

#-------------------------------------
#Third model

data$strata2<-as.numeric(data$strata)-1

fit3 <- glm(case ~ strata+exp+exp:I(strata2-3.5), family = binomial, data = data)
#summary(fit3)  

#model.matrix(fit3)
## interaction has opposite sign than in the article, 
## but I do not see mistake here

alpha3<-unname(c(coef(fit3)[1],coef(fit3)[1]+coef(fit3)[2:6]))
beta3<-unname(coef(fit3)[7])
beta3_sd<-summary(fit3)$coefficients[7,2]

gamma3<-unname(coef(fit3)[8])
gamma3_sd<-summary(fit3)$coefficients[8,2]

P1i_3<-(exp(alpha3+beta3+gamma3*c(1:6-3.5)))/
       (1+exp(alpha3+beta3+gamma3*c(1:6-3.5)))

P0i_3<-(exp(alpha3))/(1+exp(alpha3))

ai3=m1i*P1i_3

bi3=m0i*P0i_3

ci3=m1i-ai3

di3=m0i-bi3

G3<-Good(TAB,ai3,bi3,ci3,di3)
Gv3<-Goodv(TAB,ai3,bi3,ci3,di3)


#-------------------------------------
#creating table

r1=c("1","6","6",formatC(c(G1,Gv1),format="f",digits=2),
     formatC(alpha1,format="f",digits=3),"-","-")
r2=c("2","7","5",formatC(c(G2,Gv2),format="f",digits=2),
     formatC(alpha2,format="f",digits=3),
     paste(formatC(beta2,format="f",digits=3),"±",
           formatC(beta2_sd,format="f",digits=3),sep=""),"-")
r3=c("3","8","4",formatC(c(G3,Gv3),format="f",digits=2),
     formatC(alpha3,format="f",digits=3),
     paste(formatC(beta3,format="f",digits=3),"±",
           formatC(beta3_sd,format="f",digits=3),sep=""),
     paste(formatC(gamma3,format="f",digits=3),"±",
           formatC(gamma3_sd,format="f",digits=3),sep=""))

TAB6.2 <- rbind(r1,r2,r3)

colnames(TAB6.2)=c("","par","DF","GOF \\n Log-like.","Chi2",
                   'Age 25-34',"35-44","45-54","55-64","65-74","75+",
                   "Alcohol","Alcohol x age")
rownames(TAB6.2)=c("","","")
knitr::kable(TAB6.2,caption = "Results of fitting several versions 
             of the linear logistic model.",align="c")
Results of fitting several versions of the linear logistic model.
par DF GOF Log-like. Chi2 Age 25-34 35-44 45-54 55-64 65-74 75+ Alcohol Alcohol x age
1 6 6 89.15 100.25 -4.745 -3.050 -1.289 -0.781 -0.656 -0.869 - -
2 7 5 10.89 9.16 -5.049 -3.505 -1.849 -1.343 -1.083 -1.090 1.654±0.189 -
3 8 4 11.17 9.43 -5.179 -3.611 -1.895 -1.336 -1.044 -1.052 1.572±0.224 -0.127±0.189
#pvalues
pG=c(pval(G1,df=6),
     pval(G2,df=5),
     pval(G3,df=4))

pGv=c(pval(Gv1,df=6),
      pval(Gv2,df=5),
      pval(Gv3,df=4))
TABp<-cbind(c("1","2","3"),
            pG,
            pGv)

colnames(TABp)=c("Model","Log-like.","Chi2")
rownames(TABp)=c("1","2","3")
knitr::kable(TABp,caption = "p-values for several versions 
             of the linear logistic model.",align="c")
p-values for several versions of the linear logistic model.
Model Log-like. Chi2
1 <0.001 <0.001
2 0.054 0.103
3 0.025 0.051

Considering the p-values, we decide between second and third model. However, the third model has difficult interpretation of interaction, therefore, we will consider the second model with linear effect of alcohol.


We can see that estimated effect of alcohol has positive value \(\beta=1.654\). The corresponding relative risk of drinking more than \(79\) g/day of alcohol, is \(\text{exp}(\beta)=5.228\). With 95% confidence interval \((3.609,\; 7.572)\). We will also calculate expected counts and compare them with observed ones (see table below).

#residual table for model 2
c1<-c("25-34","","35-44","","45-54","","55-64","","65-74","","75+","")
c2<-formatC(rep(c(0,1),6),format="d")

#tric to zip vectors
c3<-as.vector(matrix(c(formatC(TAB[2,2,],format="d"),
                    formatC(TAB[2,1,],format="d")),nrow=2,byrow=TRUE))
c4<-as.vector(matrix(c(formatC(ai2,format="f",digits=2),
                        formatC(bi2,format="f",digits=2)),nrow=2,byrow=TRUE))

v1=P1i_2*(1-P1i_2)*m1i
v2=P0i_2*(1-P0i_2)*m0i

c5<-as.vector(matrix(c(formatC(v1,format="f",digits=2),
                       formatC(v2,format="f",digits=2)),nrow=2,byrow=TRUE))
c6<-as.vector(matrix(c(formatC((TAB[2,2,]-ai2)/sqrt(v1),format="f",digits=2),
                       formatC((TAB[2,1,]-bi2)/sqrt(v2),format="f",digits=2)),
                     nrow=2,byrow=TRUE))

TABres<-cbind(c1,c2,c3,c4,c5,c6)
colnames(TABres)=c("Age straturm","Exposure","Observed",
                   "Expected","Variance","Standardized residuals")
rownames(TABres)=c("","","",
                   "","","",
                   "","","",
                   "","","")
knitr::kable(TABres,caption = "Residuals from fitting model 2.",align="c")
Residuals from fitting model 2.
Age straturm Exposure Observed Expected Variance Standardized residuals
25-34 0 1 0.32 0.31 1.21
1 0 0.68 0.67 -0.82
35-44 0 4 4.07 3.52 -0.04
1 5 4.93 4.78 0.03
45-54 0 25 24.38 13.37 0.17
1 21 21.62 18.68 -0.14
55-64 0 42 40.40 17.08 0.39
1 34 35.60 28.23 -0.30
65-74 0 19 23.64 8.54 -1.59
1 36 31.36 23.43 0.96
75+ 0 5 3.19 1.16 1.69
1 8 9.81 7.34 -0.67

Question 3

3. Conduct a joint grouped analysis of alcohol and tobacco risk adjusted for age (see [BD1], p. 213–221, esp. Tables 6.5 and 6.6).

Firstly, we replicate the table 6.4 which is parameterzation of alcohol and tobacco together with interactions. But, we will show only the first 6 observation (not every combination of alcohol and tobacco).

x1<-ifelse(DATA$alctot>40 & DATA$alctot<=80,1,0)
x2<-ifelse(DATA$alctot>80 & DATA$alctot<=120,1,0)
x3<-ifelse(DATA$alctot>120,1,0)

x4<-ifelse(DATA$tob>10 & DATA$tob<=20,1,0)
x5<-ifelse(DATA$tob>20 & DATA$tob<=30,1,0)
x6<-ifelse(DATA$tob>30,1,0)

x7<-x1*x4
x8<-x1*x5
x9<-x1*x6

x10<-x2*x4
x11<-x2*x5
x12<-x2*x6

x13<-x3*x4
x14<-x3*x5
x15<-x3*x6

datax<-data.frame(age=DATA$agegr,case=DATA$case,x1,x2,x3,x4,x5,
             x6,x7,x8,x9,x10,
             x11,x12,x13,x14,x15)
knitr::kable(datax[1:6,],caption = "Calues of qualitative risk variables for the first 6 of data records: Ille-etVilaine study of oesophageal cancer.",align="c")
Calues of qualitative risk variables for the first 6 of data records: Ille-etVilaine study of oesophageal cancer.
age case x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15
35-44 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
45-54 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
35-44 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
>=75 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
45-54 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
55-64 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0

The first three parameters \(x1\), \(x2\), and \(x3\) are used for parameterization of Alcohol, next three of them \(x4\), \(x5\), and \(x6\) are used for parameterization of tobacco, and remaining parameters are used for parameterization of interactions (but they are not used at all).

Now, we will fit four models and calculate the goodness of fit for each of them.

dataTAB<-datax[,c(1,2)]
dataTAB$alc<-ifelse(DATA$alctot<=40,1,
                    ifelse(DATA$alctot>40 & DATA$alctot<=80,2,
                           ifelse(DATA$alctot>80 & DATA$alctot<=120,3,
                                  ifelse(DATA$alctot>120,4,0))))




dataTAB$tob<-ifelse(DATA$tob<=10,1,
                    ifelse(DATA$tob>10 & DATA$tob<=20,2,
                           ifelse(DATA$tob>20 & DATA$tob<=30,3,
                                  ifelse(DATA$tob>30,4,0))))
#observed counts total
TABO<-table(dataTAB$alc,dataTAB$tob,dataTAB$age)

dataTAB1<-dataTAB[dataTAB$case==1,]

#observed counts cases
TABO1<-table(dataTAB1$alc,dataTAB1$tob,dataTAB1$age)

#xxxxxxxxxxxxxxxxxxxx
# Model 1
#xxxxxxxxxxxxxxxxxxxx
fitx1 <- glm(case ~-1+ age, family = binomial, data = datax)
#summary(fitx1)

alphax1<-unname(coef(fitx1)[1:6])

P1i_x1<-(exp(alphax1))/(1+exp(alphax1))

### we will create tables of expected counts
# TABE -expected cases
TABE<-TABO1
# TABE -expected controls
TABE0<-TABO1
for (i in 1:6){
  TABE[,,i]<-TABO[,,i]*P1i_x1[i]
  TABE0[,,i]<-TABO[,,i]*(1-P1i_x1[i])
}

#goodnes of fit functions are implemented here, 
#we will use only log-like.(Goodx())
Goodx<-function(TABE,TABE0,TABO1=TABO1){
  logg<-function(x){
    ifelse(x==0,0,log(x))
  }
  SUM=0
  for (i in 1:length(TABO1[1,1,])){
    for (j in 1:length(TABO1[,1,1])){
      for (k in 1:length(TABO1[1,,1])){
        if (TABO[j,k,i] !=0){
          SUM=SUM+logg(TABO1[j,k,i]/TABE[j,k,i])*TABO1[j,k,i]+
                 logg((TABO[j,k,i]-TABO1[j,k,i])/TABE0[j,k,i])*(TABO[j,k,i]-TABO1[j,k,i])
          }
        
      }
    }
  }
  2*SUM
}

Goodvx<-function(TABE,TABE0,TABO1=TABO1){
  SUM=0
  for (i in 1:length(TABO1[1,1,])){
    for (j in 1:length(TABO1[,1,1])){
      for (k in 1:length(TABO1[1,,1])){
        if (TABO[j,k,i] !=0){
          SUM=SUM+(TABO1[j,k,i]-TABE[j,k,i])^2/TABE[j,k,i]+
                 (TABO[j,k,i]-TABO1[j,k,i]-TABE0[j,k,i])^2/TABE0[j,k,i]
          }
        
      }
    }
  }
  SUM
}
#Goodvx(TABE,TABE0,TABO1)
Gx1<-Goodx(TABE,TABE0,TABO1)


#xxxxxxxxxxxxxxxxxxxx
# Model 2
#xxxxxxxxxxxxxxxxxxxx
fitx2 <- glm(case ~ -1+age+x1+x2+x3, family = binomial, data = datax)
#summary(fitx2)

alphax2<-unname(coef(fitx2)[1:6])
beta2<-unname(coef(fitx2)[7:9])

P1i_x2<-(exp(alphax2))/(1+exp(alphax2))
P2i_x2<-(exp(alphax2+beta2[1]))/(1+exp(alphax2+beta2[1]))
P3i_x2<-(exp(alphax2+beta2[2]))/(1+exp(alphax2+beta2[2]))
P4i_x2<-(exp(alphax2+beta2[3]))/(1+exp(alphax2+beta2[3]))
Pi_x2<-rbind(P1i_x2,P2i_x2,P3i_x2,P4i_x2)
TABE<-TABO1
TABE0<-TABO1
for (i in 1:6){
  for (j in 1:4){
      TABE[j,,i]<-TABO[j,,i]*Pi_x2[j,i]
      TABE0[j,,i]<-TABO[j,,i]*(1-Pi_x2[j,i])
  }

}
TABEl2<-TABE
TABEl02<-TABE0
# Goodvx(TABE,TABE0,TABO1)
Gx2<-Goodx(TABE,TABE0,TABO1)


#xxxxxxxxxxxxxxxxxxxx
# Model 3
#xxxxxxxxxxxxxxxxxxxx
fitx3 <- glm(case ~ -1+age+x4+x5+x6, family = binomial, data = datax)
#summary(fitx3)

alphax3<-unname(coef(fitx3)[1:6])
betax3<-unname(coef(fitx3)[7:9])

P1i_x3<-(exp(alphax3))/(1+exp(alphax3))
P2i_x3<-(exp(alphax3+betax3[1]))/(1+exp(alphax3+betax3[1]))
P3i_x3<-(exp(alphax3+betax3[2]))/(1+exp(alphax3+betax3[2]))
P4i_x3<-(exp(alphax3+betax3[3]))/(1+exp(alphax3+betax3[3]))
Pi_x3<-rbind(P1i_x3,P2i_x3,P3i_x3,P4i_x3)
TABE<-TABO1
TABE0<-TABO1
for (i in 1:6){
  for (j in 1:4){
      TABE[,j,i]<-TABO[,j,i]*Pi_x3[j,i]
      TABE0[,j,i]<-TABO[,j,i]*(1-Pi_x3[j,i])
  }

}
# Goodvx(TABE,TABE0,TABO1)
Gx3<-Goodx(TABE,TABE0,TABO1)

#xxxxxxxxxxxxxxxxxxxx
# Model 4
#xxxxxxxxxxxxxxxxxxxx
fitx4 <- glm(case ~ -1+age+x1+x2+x3+x4+x5+x6, 
             family = binomial, data = datax)
#summary(fitx4)

alphax4<-unname(coef(fitx4)[1:6])

#trick to add 0 at the beginning
betax4<-c(0,unname(coef(fitx4)[7:9]))
gammax4<-c(0,unname(coef(fitx4)[10:12]))

TABP<-TABO
for (i in 1:6){
  for (j in 1:4){
    for (k in 1:4){
      TABP[j,k,i]<-(exp(alphax4[i]+betax4[j]+gammax4[k]))/(1+exp(alphax4[i]+betax4[j]+gammax4[k]))
    }
  }
}

TABE<-TABO*TABP
TABE0<-TABO*(1-TABP)

#Goodvx(TABE,TABE0,TABO1)
Gx4<-Goodx(TABE,TABE0,TABO1)
c0<-c("1","","2","","3","","4","","")
c1<-c("Age","","Age "," Alcohol (1-3)","Age","tobacco (4-6)","Age","Alcohol (1-3)","tobacco (4-6)")
c2<-c("6","","9","","9","","12","","")
c3<-c("82","","79","","79","","76","","")
c4<-c(formatC(Gx1,format="f",digits=1),"",
      formatC(Gx2,format="f",digits=1),"",
      formatC(Gx3,format="f",digits=1),"",
      formatC(Gx4,format="f",digits=1),"","")
c5<-c("No effect of alcohol or","tobacco",
      "Effect of alcohol only,","adjusted for age",
      "Effect of tobacco only,","adjusted for age",
      "Main effects for alcohol and","tobacco (multiplicative hypo-",
      "thesis), adjusted for age")
TABout<-data.frame(c0,c1,c2,c3,c4,c5)
colnames(TABout)<-c("Model","Regression variables included",
                    "No. of parameters","DF","Goodness of fit G",
                    "Hypothesis tested and/or interpretation")


kableExtra::kable(TABout,caption = "Summary of goodness of fit of several logistic regression models: grouped data from the 
Ille-et-Vilaine study of oesophageal cancer.",align="c")%>% 
  kableExtra::row_spec(c(1,2,5,6), background = "#F2F2F2")%>%
  kableExtra::row_spec(0, extra_css = "border-bottom: 1px solid")
Summary of goodness of fit of several logistic regression models: grouped data from the Ille-et-Vilaine study of oesophageal cancer.
Model Regression variables included No. of parameters DF Goodness of fit G Hypothesis tested and/or interpretation
1 Age 6 82 255.5 No effect of alcohol or
tobacco
2 Age 9 79 104.5 Effect of alcohol only,
Alcohol (1-3) adjusted for age
3 Age 9 79 218.9 Effect of tobacco only,
tobacco (4-6) adjusted for age
4 Age 12 76 79.7 Main effects for alcohol and
Alcohol (1-3) tobacco (multiplicative hypo-
tobacco (4-6) thesis), adjusted for age

First of all, we will compare this models hierarchically

c1<-c("1 vs 2","1 vs 3","2 vs 4","3 vs 4")
c2<-c("3","3","3","3")
c3<-c(formatC(Gx1-Gx2,format="f",digits=1),
      formatC(Gx1-Gx3,format="f",digits=1),
      formatC(Gx2-Gx4,format="f",digits=1),
      formatC(Gx3-Gx4,format="f",digits=1))
c4<-c(pval((Gx1-Gx2),3),
      pval((Gx1-Gx3),3),
      pval((Gx2-Gx4),3),
      pval((Gx3-Gx4),3))
TABout<-data.frame(c1,c2,c3,c4)
colnames(TABout)<-c("Model comparison","DF",
                    "(Chi2_submodel-Chi2)","p-value")


knitr::kable(TABout,caption = "p-values, comparison of submodels.",align="c",format = "simple")
p-values, comparison of submodels.
Model comparison DF (Chi2_submodel-Chi2) p-value
1 vs 2 3 151.0 <0.001
1 vs 3 3 36.6 <0.001
2 vs 4 3 24.7 <0.001
3 vs 4 3 139.1 <0.001

We always reject the submodel, therefore, we consider model 4 as the most appropriate. To summarize the estimates of this model, we show all relative risks (relative to abstinent non smoker) in table below. As we can see, the relative risk of cancer is increasing with amount of alcohol and tobacco.

TABP1<-TABP[,,2]
for (j in 1:4){
    for (k in 1:4){
      TABP1[j,k]<-(exp(betax4[j]+gammax4[k]))
    }
}
c1<-c("0-39","40-79","80-119","120+")
TABP1out<-cbind(c1,formatC(TABP1,format="f",digits=1))
colnames(TABP1out)=c("Alcohol (g/day)","Tabaco (g/day) <br> 0-9","10-19","20-29","30+")
rownames(TABP1out)=c("","","","")
knitr::kable(TABP1out,caption = "Age-adjusted relative risks for each alcohol/tobacco category according to multiplicative model: Ille-et-Vilaine oesophageal cancer study.",align="c",format = "simple")
Age-adjusted relative risks for each alcohol/tobacco category according to multiplicative model: Ille-et-Vilaine oesophageal cancer study.
Alcohol (g/day) Tabaco (g/day)
0-9
10-19 20-29 30+
0-39 1.0 1.6 1.7 5.5
40-79 4.8 7.7 8.1 26.6
80-119 7.3 11.7 12.3 40.3
120+ 48.0 76.9 80.8 264.4

And also the comparison of observed and expected counts for each combination of alcohol and tobacco group is provided.

totobs<-TABO1[,,1]+TABO1[,,2]+TABO1[,,3]+
        TABO1[,,4]+TABO1[,,5]+TABO1[,,6]

totexp<-TABE[,,1]+TABE[,,2]+TABE[,,3]+
        TABE[,,4]+TABE[,,5]+TABE[,,6]
c1<-c("0-39","40-79","80-119","120+")



TABout<-data.frame(c1,formatC(totobs[,1],format="f",digits=0),
                 formatC(totexp[,1],format="f",digits=1),
                 formatC(totobs[,2],format="f",digits=0),
                 formatC(totexp[,2],format="f",digits=1),
                 formatC(totobs[,3],format="f",digits=0),
                 formatC(totexp[,3],format="f",digits=1),
                 formatC(totobs[,4],format="f",digits=0),
                 formatC(totexp[,4],format="f",digits=1))
colnames(TABout)<-c("Alcohol <br> (g/day) <br> ","tobacco <br> (g/day)<br> 0-9<br> O","E","10-19 <br> O","E","20-29 <br> O","E","30+ <br> O","E")
knitr::kable(TABout,caption = "Observed and expected (age-adjusted) numbers
of cases for each alcohol/tobacco category according
to multiplicative model: Ille-et-Vilaine oesophageal cancer study.",align="c",format = "simple")
Observed and expected (age-adjusted) numbers of cases for each alcohol/tobacco category according to multiplicative model: Ille-et-Vilaine oesophageal cancer study.
Alcohol
(g/day)
tobacco
(g/day)
0-9
O
E 10-19
O
E 20-29
O
E 30+
O
E
0-39 9 13.0 10 7.3 5 3.4 5 5.2
40-79 34 32.2 18 19.0 15 16.3 9 8.5
80-119 19 17.3 18 19.2 6 6.9 7 6.6
120+ 16 15.5 12 12.4 7 6.4 10 10.7

Question 4

4. Conduct a joint ungrouped analysis of alcohol and tobacco risk adjusted for age (see [BD1], p. 227–231, esp. Table 6.12).

The data for alcohol and tobacco were originally collected as continuous. Therefore, we can extract more information if we build model with continuous covariates. We fit seven different models with different parameterizations (see table below).

datac<-iev[,c(1,3,5,11)]
datac$logalc<-log(datac$alctot+1)
datac$logtob<-log(datac$tob+1)

#xxxxxxxxxxxxxxxxxxxx
# Model 1
#xxxxxxxxxxxxxxxxxxxx
fitc1 <- glm(case ~agegr+alctot+tob, family = binomial, data = datac)
#summary(fitc1)

#xxxxxxxxxxxxxxxxxxxx
# Model 2
#xxxxxxxxxxxxxxxxxxxx
fitc2 <- glm(case ~agegr+tob+logalc, family = binomial, data = datac)
#summary(fitc2)

#xxxxxxxxxxxxxxxxxxxx
# Model 3
#xxxxxxxxxxxxxxxxxxxx
fitc3 <- glm(case ~agegr+alctot+logtob, family = binomial, data = datac)
#summary(fitc3)

#xxxxxxxxxxxxxxxxxxxx
# Model 4
#xxxxxxxxxxxxxxxxxxxx
fitc4 <- glm(case ~agegr+logalc+logtob, family = binomial, data = datac)
#summary(fitc4)

#xxxxxxxxxxxxxxxxxxxx
# Model 5
#xxxxxxxxxxxxxxxxxxxx
fitc5 <- glm(case ~agegr+alctot+tob+I(tob^2), family = binomial, 
             data = datac)
#summary(fitc5)

#xxxxxxxxxxxxxxxxxxxx
# Model 6
#xxxxxxxxxxxxxxxxxxxx
fitc6 <- glm(case ~agegr+alctot+logtob+I(alctot^2), family = binomial, 
             data = datac)
#summary(fitc6)


#xxxxxxxxxxxxxxxxxxxx
# Model 7
#xxxxxxxxxxxxxxxxxxxx
fitc7 <- glm(case ~agegr+alctot+logtob+I(logtob^2), family = binomial, 
             data = datac)
#summary(fitc7)




c1<-c("1","","2","","3","",
      "4","","5","","6","",
      "7","")
c2<-c("8","","8","","8","",
      "8","","9","","9","",
      "9","")

c3<-c("967","","967","","967","",
      "967","","966","","966","",
      "966","")
c4<-c(formatC(fitc1$deviance,format="f",digits=1),"",
      formatC(fitc2$deviance,format="f",digits=1),"",
      formatC(fitc3$deviance,format="f",digits=1),"",
      formatC(fitc4$deviance,format="f",digits=1),"",
      formatC(fitc5$deviance,format="f",digits=1),"",
      formatC(fitc6$deviance,format="f",digits=1),"",
      formatC(fitc7$deviance,format="f",digits=1),"")

# the authors of the article used four different formatting 
# of decimal place in one table, for some reason

c5<-c(formatC(coef(fitc1)[7],format="f",digits=4),
      paste("(",formatC(summary(fitc1)$coefficients[7,3],format="f",digits=2)
              ,")",sep=""),"","",
      formatC(coef(fitc3)[7],format="f",digits=4),
      paste("(",formatC(summary(fitc3)$coefficients[7,3],format="f",digits=2)
              ,")",sep=""),"","",
      formatC(coef(fitc5)[7],format="f",digits=4),
      paste("(",formatC(summary(fitc5)$coefficients[7,3],format="f",digits=2)
              ,")",sep=""),
      formatC(coef(fitc6)[7],format="f",digits=4),
      paste("(",formatC(summary(fitc6)$coefficients[7,3],format="f",digits=2)
              ,")",sep=""),
      formatC(coef(fitc7)[7],format="f",digits=4),
      paste("(",formatC(summary(fitc7)$coefficients[7,3],format="f",digits=2)
              ,")",sep=""))


c6<-c(formatC(coef(fitc1)[8],format="f",digits=4),
      paste("(",formatC(summary(fitc1)$coefficients[8,3],format="f",digits=2)
              ,")",sep=""),
      formatC(coef(fitc2)[7],format="f",digits=4),
      paste("(",formatC(summary(fitc2)$coefficients[7,3],format="f",digits=2)
              ,")",sep=""),"","","","",
      formatC(coef(fitc5)[8],format="f",digits=4),
      paste("(",formatC(summary(fitc5)$coefficients[8,3],format="f",digits=2)
              ,")",sep=""),"","","","")

c7<-c("","",
      formatC(coef(fitc2)[8],format="f",digits=3),
      paste("(",formatC(summary(fitc2)$coefficients[8,3],format="f",digits=2)
              ,")",sep=""),"","",
      formatC(coef(fitc4)[7],format="f",digits=3),
      paste("(",formatC(summary(fitc4)$coefficients[7,3],format="f",digits=2)
              ,")",sep=""),"","","","","","")

c8<-c("","","","",
      formatC(coef(fitc3)[8],format="f",digits=3),
      paste("(",formatC(summary(fitc3)$coefficients[8,3],format="f",digits=2)
              ,")",sep=""),
      formatC(coef(fitc4)[8],format="f",digits=3),
      paste("(",formatC(summary(fitc4)$coefficients[8,3],format="f",digits=2)
              ,")",sep=""),"","",
      formatC(coef(fitc6)[8],format="f",digits=3),
      paste("(",formatC(summary(fitc6)$coefficients[8,3],format="f",digits=2)
              ,")",sep=""),
      formatC(coef(fitc7)[8],format="f",digits=3),
      paste("(",formatC(summary(fitc7)$coefficients[8,3],format="f",digits=2)
              ,")",sep="")
      )

c9<-c("","","","","","","","","","",
      formatC(coef(fitc6)[9],format="f",digits=4),
      paste("(",formatC(summary(fitc6)$coefficients[9,3],format="f",digits=2)
              ,")",sep=""),"","")

c10<-c("","","","","","","","",
      formatC(coef(fitc5)[9],format="f",digits=4),
      paste("(",formatC(summary(fitc5)$coefficients[9,3],format="f",digits=2)
              ,")",sep=""),"","","","")

c11<-c("","","","","","","","","","","","",
      formatC(coef(fitc7)[9],format="f",digits=3),
      paste("(",formatC(summary(fitc7)$coefficients[9,3],format="f",digits=2)
              ,")",sep=""))
TABout<-data.frame(c1,c2,c3,c4,c5,
                   c6,c7,c8,c9,c10,
                   c11)
colnames(TABout)<-c("","No. of <br> par. ","DF<br> ","Goodness <br> of fit <br> G<br> ","Regression coefficicents <br> (standardized coefficients) <br> ALC<br> ","TOB<br> ","LOG <br> (ALC+1)","LOG <br> (TOB+1)","$\\textbf{ALC}^2$<br> ","$\\textbf{TOB}^2$<br> ","$\\textbf{LOG}^2$(TOB+1)<br> ")
knitr::kable(TABout,caption = "Observed and expected (age-adjusted) numbers
of cases for each alcohol/tobacco category according
to multiplicative model: Ille-et-Vilaine oesophageal cancer study.",align="c",format="simple")
Observed and expected (age-adjusted) numbers of cases for each alcohol/tobacco category according to multiplicative model: Ille-et-Vilaine oesophageal cancer study.
No. of
par.
DF
Goodness
of fit
G
Regression coefficicents
(standardized coefficients)
ALC
TOB
LOG
(ALC+1)
LOG
(TOB+1)
\(\textbf{ALC}^2\)
\(\textbf{TOB}^2\)
\(\textbf{LOG}^2\)(TOB+1)
1 8 967 695.5 0.0260 0.0405
(10.01) (5.14)
2 8 967 749.5 0.0411 0.934
(5.47) (7.02)
3 8 967 683.2 0.0252 0.539
(9.67) (5.78)
4 8 967 734.9 0.890 0.556
(6.74) (6.20)
5 9 966 693.9 0.0257 0.0648 -0.0006
(9.88) (3.06) (-1.24)
6 9 966 682.8 0.0202 0.539 0.0000
(2.52) (5.77) (0.65)
7 9 966 681.2 0.0251 0.964 -0.114
(9.62) (3.05) (-1.43)

Again, we compare models with submodels to see which parameterizations is the most apropriate.

c1<-c("1 vs 5","3 vs 6","3 vs 7")
c2<-c("1","1","1")

c3<-c(formatC(fitc1$deviance-fitc5$deviance,format="f",digits=1),
      formatC(fitc3$deviance-fitc6$deviance,format="f",digits=1),
      formatC(fitc3$deviance-fitc7$deviance,format="f",digits=1))

c4<-c(pval(fitc1$deviance-fitc5$deviance,1),
      pval(fitc3$deviance-fitc6$deviance,1),
      pval(fitc3$deviance-fitc7$deviance,1))

TABout<-data.frame(c1,c2,c3,c4)
colnames(TABout)<-c("Model comparison","DF",
                    "Chi2_submodel-Chi2","p-value")


knitr::kable(TABout,caption = "p-values, comparison of submodels.",align="c",format = "simple")
p-values, comparison of submodels.
Model comparison DF Chi2_submodel-Chi2 p-value
1 vs 5 1 1.5 0.213
3 vs 6 1 0.4 0.507
3 vs 7 1 2.1 0.148

As we can see, there is no significant improvement of models 5, 6, and 7 over smaller submodels 1 and 3. We consider the model three as the most appropriate according to smallest goodness of fit value. In this model, the effect of alcohol is considered without transformation and tobacco is used in logarithmic scale. Then, we tried to add interaction of alcohol and logarithm of tobacco (p-value is \(0.457\)), interaction of alcohol and age (p-value is \(0.650\)), and interaction of logarithm of tobacco and age (p-value is \(0.953\)) separately. But, none of these interactions were significant.

#xxxxxxxxxxxxxxxxxxxx
# Model 8
#xxxxxxxxxxxxxxxxxxxx
fitc8 <- glm(case ~agegr+alctot*logtob, family = binomial, data = datac)
#summary(fitc8)

p1<-pval(fitc3$deviance-fitc8$deviance,1)
# [1] "0.457"
#anova(fitc3,fitc8, test = "Chisq") # the same output

#xxxxxxxxxxxxxxxxxxxx
# Model 9
#xxxxxxxxxxxxxxxxxxxx
fitc9 <- glm(case ~agegr*alctot+logtob, family = binomial, data = datac)
#summary(fitc9)

p2<-pval(fitc3$deviance-fitc9$deviance,5)
# [1] "0.650"
#anova(fitc3,fitc9, test = "Chisq") # the same output


#xxxxxxxxxxxxxxxxxxxx
# Model 10
#xxxxxxxxxxxxxxxxxxxx
fitc10 <- glm(case ~alctot+agegr*logtob, family = binomial, data = datac)
#summary(fitc10)

p3<-pval(fitc3$deviance-fitc10$deviance,5)
# [1] "0.953"
#anova(fitc3,fitc10, test = "Chisq") # the same output

c1<-c("ALC x LOG(TOB+1)","ALC x AGE","LOG(TOB+1) x AGE")
c2<-c("1","5","5")
c3<-c(formatC(fitc3$deviance-fitc8$deviance,format="f",digits=1),
      formatC(fitc3$deviance-fitc9$deviance,format="f",digits=1),
      formatC(fitc3$deviance-fitc10$deviance,format="f",digits=1))
c4<-c(p1,p2,p3)

TABout<-data.frame(c1,c2,c3,c4)
colnames(TABout)<-c("Added interaction","DF",
                    "Chi2_submodel-Chi2","p-value")


knitr::kable(TABout,caption = "p-values, comparison of models with interaction and model 3.",align="c",format = "simple")
p-values, comparison of models with interaction and model 3.
Added interaction DF Chi2_submodel-Chi2 p-value
ALC x LOG(TOB+1) 1 0.6 0.457
ALC x AGE 5 3.3 0.650
LOG(TOB+1) x AGE 5 1.1 0.953

If we want to interpret relative risk (relative to abstinent non smoker) of the particular amount of alcohol \(ALC\) and tobacco \(TOB\) both in g/day we would have to calculate \[ e^{\beta_7 ALC+\beta_8 \text{log}(TOB+1)}=e^{0.025\cdot ALC}\cdot(TOB+1)^{0.539}. \]