6  Regression Calibration

library(dplyr)
library(tidyverse)
library(tibble)
library(ggplot2)
library(knitr)
library(hrbrthemes)
library(viridis)
library(broom)
library(boot) # for bootstrap SE

library(devtools)
#using data specified in this github repository:
install_github("jhs-hwg/cardioStatsUSA")
library(cardioStatsUSA)
#Read in the subset data
subset_nhanes <- readRDS("nhanes_subset.rds")

table(subset_nhanes$demo_race)

Non-Hispanic White Non-Hispanic Black Non-Hispanic Asian           Hispanic 
               253                128                 36                155 
             Other 
                28 
table(subset_nhanes$demo_gender)

  Men Women 
  277   323 
table(subset_nhanes$demo_age_years)

18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 
 5  7 12  7 11 11 15  9 10  7  5  4 14 11 10 13 10 14 11  8  9 12 10 11  8 13 
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 
18 18 11  9  3 10  2 11  9  9  5 13  7 14  8 12 15 12 13 12 10  3 10  5  9  6 
70 71 72 73 74 75 76 77 78 79 80 
 8  4  9  7  4  3  7  6  3  4 34 
table(subset_nhanes$cc_smoke)

  Never  Former Current 
    344     131     125 
table(subset_nhanes$cc_bmi)

      <25 25 to <30 30 to <35       35+ 
      191       175       126       108 
table(subset_nhanes$bp_med_use)

 No Yes 
435 165 
table(subset_nhanes$cc_diabetes)

 No Yes 
522  78 

7 Simple Regression

7.1 Naive Analysis

Before starting with regression calibration, we first want to see what a model that only uses our noisy data looks like. We’ll use the same model formula as before (including age, race, smoking status etc.), the only difference being that we will use our error-full data, specifically the low reliability case.

naive.model=glm(cc_diabetes ~ bp_sys_mean_noise_low, family="binomial", data=subset_nhanes)
summary(naive.model)

Call:
glm(formula = cc_diabetes ~ bp_sys_mean_noise_low, family = "binomial", 
    data = subset_nhanes)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -2.519646   0.369018  -6.828 8.61e-12 ***
bp_sys_mean_noise_low  0.004826   0.002639   1.828   0.0675 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 463.66  on 599  degrees of freedom
Residual deviance: 460.29  on 598  degrees of freedom
AIC: 464.29

Number of Fisher Scoring iterations: 4
knitr::kable(tidy(naive.model, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.080 0.369 -6.828 0.000 0.038 0.162
bp_sys_mean_noise_low 1.005 0.003 1.828 0.067 1.000 1.010

Looking at the summary above, the coefficient for our noisy blood pressure variable is small (0.0048), and its p value (0.0675) falls just short of significance at the 0.05 level. So when only the error-prone blood pressure measurement is used, the model does not find blood pressure to be a significant predictor of diabetes status.

7.2 Gold Standard Analysis

Next, we’ll look at our “true” model, the model which uses our unbiased, non-error blood pressure values:

true.model=glm(cc_diabetes ~ bp_sys_mean, family="binomial", data=subset_nhanes)
summary(true.model)

Call:
glm(formula = cc_diabetes ~ bp_sys_mean, family = "binomial", 
    data = subset_nhanes)

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -5.777250   0.787205  -7.339 2.15e-13 ***
bp_sys_mean  0.030330   0.005911   5.131 2.89e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 463.66  on 599  degrees of freedom
Residual deviance: 437.53  on 598  degrees of freedom
AIC: 441.53

Number of Fisher Scoring iterations: 5
knitr::kable(tidy(true.model, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.003 0.787 -7.339 0 0.001 0.014
bp_sys_mean 1.031 0.006 5.131 0 1.019 1.043

For this subset of the data (n = 600), the true beta value for the systolic mean is 0.0303, corresponding to an odds ratio of about 1.031 per one-unit increase in systolic blood pressure. This is substantially larger than the naive estimate of 0.0048 obtained from the noisy data, illustrating how measurement error attenuates the coefficient toward zero. Also notice that the blood pressure coefficient is now highly significant (p ≈ 2.9e-07), whereas it was not in the naive model.

7.3 Regression Calibration

# Create a model which predicts the true BP value based on the noisy BP data
rcfit<-lm(bp_sys_mean ~ bp_sys_mean_noise_low, data=subset_nhanes)
summary(rcfit)

Call:
lm(formula = bp_sys_mean ~ bp_sys_mean_noise_low, data = subset_nhanes)

Residuals:
    Min      1Q  Median      3Q     Max 
-40.512 -12.063  -1.631   8.883  71.880 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)           105.08228    2.04124  51.480   <2e-16 ***
bp_sys_mean_noise_low   0.15067    0.01538   9.797   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 17.45 on 598 degrees of freedom
Multiple R-squared:  0.1383,    Adjusted R-squared:  0.1369 
F-statistic: 95.98 on 1 and 598 DF,  p-value: < 2.2e-16
subset_nhanes$bp_hat <- predict(rcfit, newdata=subset_nhanes) 

final.model = glm(cc_diabetes ~ bp_hat, family="binomial",data=subset_nhanes)
summary(final.model)

Call:
glm(formula = cc_diabetes ~ bp_hat, family = "binomial", data = subset_nhanes)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept) -5.88548    2.19259  -2.684  0.00727 **
bp_hat       0.03203    0.01752   1.828  0.06749 . 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 463.66  on 599  degrees of freedom
Residual deviance: 460.29  on 598  degrees of freedom
AIC: 464.29

Number of Fisher Scoring iterations: 4
knitr::kable(tidy(final.model, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.003 2.193 -2.684 0.007 0.000 0.196
bp_hat 1.033 0.018 1.828 0.067 0.998 1.069
print(paste("Correlation between BP_noise and BP_hat:", cor(subset_nhanes$bp_sys_mean_noise_low, subset_nhanes$bp_hat)))
[1] "Correlation between BP_noise and BP_hat: 0.999999999999997"

Fix Standard Errors:

#warning = False
set.seed(43)

# One bootstrap replicate of the full regression-calibration pipeline.
# Refitting BOTH stages (calibration lm and outcome glm) on the resampled
# rows is what lets the bootstrap SE account for the uncertainty in bp_hat,
# which the naive glm standard errors ignore.
bootstrap.functionV2 <- function(dat, inds) {
    boot.sample <- dat[inds, ]
    calib.fit <- lm(bp_sys_mean ~ bp_sys_mean_noise_low, data = boot.sample)
    boot.sample$bp_hat <- predict(calib.fit, newdata = boot.sample)
    outcome.fit <- glm(cc_diabetes ~ bp_hat, family = "binomial",
                       data = boot.sample)
    coef(outcome.fit)
}

my.boot<-boot(subset_nhanes, bootstrap.functionV2, R=500)
bsSD <- apply(my.boot$t,2,sd)
bsSD
[1] 2.08979580 0.01666782
t.stat<-coef(final.model)/bsSD
t.stat
(Intercept)      bp_hat 
  -2.816292    1.921693 

Now we can recalculate the p value for the bp_hat coefficient using the z statistic calculated above. We can do this using the pnorm() function, which returns the cumulative probability of the standard normal distribution; taking the upper tail and doubling it gives a two-sided p value.

Since our alternative hypothesis in this case is two-sided (the bp_hat coefficient differs from zero), we take the upper-tail probability, 1 - pnorm(t.stat), and multiply it by 2.

p.value.bp <- 2 * (1 - pnorm(t.stat['bp_hat']))
print(p.value.bp)
    bp_hat 
0.05464444 

Here, while we don’t recover a significant p value, we do have a lower p value compared to the naive analysis (0.055 vs 0.067). Therefore, we are more confident in our conclusion that blood pressure has a correlation to diabetes risk. In this case, we can interpret the results that for one additional unit increase in systolic blood pressure, the estimated risk of having diabetes increases by 3.3%.

We can also recalculate the confidence intervals: using the exponentiated (odds ratio) estimate for the bp_hat coefficient, 1.033, and the recalculated standard error, we can perform a simple calculation to construct an approximate confidence interval at the 95% level. (Strictly, the interval should be built on the log-odds scale and then exponentiated, but for a coefficient this close to zero the approximation is very close.)

conf.low <- 1.033 - 1.96*0.01666782
conf.high <- 1.033 + 1.96*0.01666782
conf.low
[1] 1.000331
conf.high
[1] 1.065669

We see that the confidence interval does not cover 1, indicating strong evidence, at the 95% confidence level, of a positive association between blood pressure and diabetes.

8 Selected Model Regression

8.1 Naive Analysis

Before starting with regression calibration, we first want to see what a model that only uses our noisy data looks like. We’ll use the same model formula as before (including age, race, smoking status etc.), the only difference being that we will use our error-full data, specifically the low reliability case.

naive.model=glm(cc_diabetes ~ bp_sys_mean_noise_low + demo_age_years + demo_gender + demo_race + cc_bmi + bp_med_use, family="binomial", data=subset_nhanes)
summary(naive.model)

Call:
glm(formula = cc_diabetes ~ bp_sys_mean_noise_low + demo_age_years + 
    demo_gender + demo_race + cc_bmi + bp_med_use, family = "binomial", 
    data = subset_nhanes)

Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -6.360537   0.810700  -7.846 4.30e-15 ***
bp_sys_mean_noise_low        0.001768   0.002926   0.604 0.545658    
demo_age_years               0.047772   0.010231   4.669 3.02e-06 ***
demo_genderWomen            -0.593300   0.278581  -2.130 0.033195 *  
demo_raceNon-Hispanic Black  0.861090   0.365695   2.355 0.018539 *  
demo_raceNon-Hispanic Asian  1.151863   0.708336   1.626 0.103917    
demo_raceHispanic            1.286503   0.341675   3.765 0.000166 ***
demo_raceOther               0.780841   0.814489   0.959 0.337716    
cc_bmi25 to <30              0.650462   0.412058   1.579 0.114435    
cc_bmi30 to <35              1.234393   0.436396   2.829 0.004675 ** 
cc_bmi35+                    1.861836   0.453735   4.103 4.07e-05 ***
bp_med_useYes                0.771081   0.308493   2.500 0.012437 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 463.66  on 599  degrees of freedom
Residual deviance: 368.59  on 588  degrees of freedom
AIC: 392.59

Number of Fisher Scoring iterations: 6
knitr::kable(tidy(naive.model, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.002 0.811 -7.846 0.000 0.000 0.008
bp_sys_mean_noise_low 1.002 0.003 0.604 0.546 0.996 1.008
demo_age_years 1.049 0.010 4.669 0.000 1.029 1.071
demo_genderWomen 0.553 0.279 -2.130 0.033 0.318 0.950
demo_raceNon-Hispanic Black 2.366 0.366 2.355 0.019 1.157 4.884
demo_raceNon-Hispanic Asian 3.164 0.708 1.626 0.104 0.659 11.522
demo_raceHispanic 3.620 0.342 3.765 0.000 1.871 7.181
demo_raceOther 2.183 0.814 0.959 0.338 0.319 9.110
cc_bmi25 to <30 1.916 0.412 1.579 0.114 0.870 4.434
cc_bmi30 to <35 3.436 0.436 2.829 0.005 1.488 8.340
cc_bmi35+ 6.436 0.454 4.103 0.000 2.715 16.253
bp_med_useYes 2.162 0.308 2.500 0.012 1.184 3.982

Looking at the summary above, the significant coefficients — and therefore the variables which the model finds most useful in diabetes status prediction — are age, gender, Non-Hispanic Black and Hispanic racial status, blood pressure medication use, and BMI status for the 30 to <35 and 35+ categories. Note that the p value for our noisy blood pressure variable is quite high (0.546), showing non-significance in the model.

8.2 Gold Standard Analysis

Next, we’ll look at our “true” model, the model which uses our unbiased, non-error blood pressure values:

true.model=glm(cc_diabetes ~ bp_sys_mean + demo_age_years + demo_gender + demo_race + cc_bmi + bp_med_use, family="binomial", data=subset_nhanes)
summary(true.model)

Call:
glm(formula = cc_diabetes ~ bp_sys_mean + demo_age_years + demo_gender + 
    demo_race + cc_bmi + bp_med_use, family = "binomial", data = subset_nhanes)

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -8.03913    1.12566  -7.142 9.22e-13 ***
bp_sys_mean                  0.01700    0.00734   2.316 0.020570 *  
demo_age_years               0.04199    0.01058   3.969 7.21e-05 ***
demo_genderWomen            -0.61506    0.28041  -2.193 0.028273 *  
demo_raceNon-Hispanic Black  0.88094    0.36250   2.430 0.015091 *  
demo_raceNon-Hispanic Asian  1.19350    0.71337   1.673 0.094321 .  
demo_raceHispanic            1.29849    0.34264   3.790 0.000151 ***
demo_raceOther               0.83248    0.82019   1.015 0.310114    
cc_bmi25 to <30              0.70851    0.41714   1.699 0.089413 .  
cc_bmi30 to <35              1.28751    0.44345   2.903 0.003692 ** 
cc_bmi35+                    1.97864    0.46266   4.277 1.90e-05 ***
bp_med_useYes                0.68256    0.30655   2.227 0.025973 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 463.66  on 599  degrees of freedom
Residual deviance: 363.63  on 588  degrees of freedom
AIC: 387.63

Number of Fisher Scoring iterations: 6
knitr::kable(tidy(true.model, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.000 1.126 -7.142 0.000 0.000 0.003
bp_sys_mean 1.017 0.007 2.316 0.021 1.003 1.032
demo_age_years 1.043 0.011 3.969 0.000 1.022 1.065
demo_genderWomen 0.541 0.280 -2.193 0.028 0.309 0.932
demo_raceNon-Hispanic Black 2.413 0.362 2.430 0.015 1.188 4.951
demo_raceNon-Hispanic Asian 3.299 0.713 1.673 0.094 0.682 12.160
demo_raceHispanic 3.664 0.343 3.790 0.000 1.889 7.279
demo_raceOther 2.299 0.820 1.015 0.310 0.333 9.724
cc_bmi25 to <30 2.031 0.417 1.699 0.089 0.914 4.749
cc_bmi30 to <35 3.624 0.443 2.903 0.004 1.550 8.925
cc_bmi35+ 7.233 0.463 4.277 0.000 3.006 18.630
bp_med_useYes 1.979 0.307 2.227 0.026 1.088 3.629

For this subset of the data (n = 600), the true beta value for the systolic mean is 0.0170, corresponding to an odds ratio of about 1.017 per one-unit increase in systolic blood pressure. This is substantially larger than the naive estimate of 0.0018 from the noisy data, again showing attenuation due to measurement error. Also notice that the blood pressure coefficient is significant here (p = 0.021), whereas it was not in the naive model (p = 0.546).

8.3 Regression Calibration

# Create a model which predicts the true BP value based on the noisy BP data
rcfit<-lm(bp_sys_mean ~ bp_sys_mean_noise_low + demo_age_years + demo_gender + demo_race + cc_bmi + bp_med_use, data=subset_nhanes)
summary(rcfit)

Call:
lm(formula = bp_sys_mean ~ bp_sys_mean_noise_low + demo_age_years + 
    demo_gender + demo_race + cc_bmi + bp_med_use, data = subset_nhanes)

Residuals:
    Min      1Q  Median      3Q     Max 
-38.474  -9.810  -1.711   8.647  63.218 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                 89.56535    2.78350  32.177  < 2e-16 ***
bp_sys_mean_noise_low        0.11877    0.01382   8.596  < 2e-16 ***
demo_age_years               0.40142    0.04192   9.575  < 2e-16 ***
demo_genderWomen            -1.72991    1.27718  -1.354 0.176102    
demo_raceNon-Hispanic Black  0.63160    1.69938   0.372 0.710275    
demo_raceNon-Hispanic Asian -1.84153    2.77735  -0.663 0.507557    
demo_raceHispanic           -0.75026    1.56330  -0.480 0.631462    
demo_raceOther              -0.91382    3.07167  -0.297 0.766191    
cc_bmi25 to <30             -0.60550    1.62407  -0.373 0.709408    
cc_bmi30 to <35              0.23627    1.78558   0.132 0.894777    
cc_bmi35+                   -0.85737    1.93105  -0.444 0.657214    
bp_med_useYes                5.62935    1.68476   3.341 0.000887 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 15.27 on 588 degrees of freedom
Multiple R-squared:  0.3519,    Adjusted R-squared:  0.3397 
F-statistic: 29.02 on 11 and 588 DF,  p-value: < 2.2e-16
subset_nhanes$bp_hat <- predict(rcfit, newdata=subset_nhanes) 

final.model = glm(cc_diabetes ~ bp_hat + demo_age_years + demo_gender + demo_race + cc_bmi + bp_med_use, family="binomial",data=subset_nhanes)
summary(final.model)

Call:
glm(formula = cc_diabetes ~ bp_hat + demo_age_years + demo_gender + 
    demo_race + cc_bmi + bp_med_use, family = "binomial", data = subset_nhanes)

Coefficients:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -7.69381    2.60474  -2.954  0.00314 ** 
bp_hat                       0.01489    0.02463   0.604  0.54566    
demo_age_years               0.04180    0.01532   2.729  0.00635 ** 
demo_genderWomen            -0.56755    0.27962  -2.030  0.04238 *  
demo_raceNon-Hispanic Black  0.85169    0.36838   2.312  0.02078 *  
demo_raceNon-Hispanic Asian  1.17928    0.71127   1.658  0.09732 .  
demo_raceHispanic            1.29767    0.34073   3.809  0.00014 ***
demo_raceOther               0.79444    0.81493   0.975  0.32963    
cc_bmi25 to <30              0.65948    0.41242   1.599  0.10981    
cc_bmi30 to <35              1.23088    0.43622   2.822  0.00478 ** 
cc_bmi35+                    1.87460    0.45404   4.129 3.65e-05 ***
bp_med_useYes                0.68728    0.32203   2.134  0.03282 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 463.66  on 599  degrees of freedom
Residual deviance: 368.59  on 588  degrees of freedom
AIC: 392.59

Number of Fisher Scoring iterations: 6
knitr::kable(tidy(final.model, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.000 2.605 -2.954 0.003 0.000 0.071
bp_hat 1.015 0.025 0.604 0.546 0.967 1.065
demo_age_years 1.043 0.015 2.729 0.006 1.012 1.075
demo_genderWomen 0.567 0.280 -2.030 0.042 0.325 0.976
demo_raceNon-Hispanic Black 2.344 0.368 2.312 0.021 1.140 4.864
demo_raceNon-Hispanic Asian 3.252 0.711 1.658 0.097 0.674 11.916
demo_raceHispanic 3.661 0.341 3.809 0.000 1.895 7.248
demo_raceOther 2.213 0.815 0.975 0.330 0.323 9.243
cc_bmi25 to <30 1.934 0.412 1.599 0.110 0.877 4.478
cc_bmi30 to <35 3.424 0.436 2.822 0.005 1.484 8.308
cc_bmi35+ 6.518 0.454 4.129 0.000 2.749 16.476
bp_med_useYes 1.988 0.322 2.134 0.033 1.060 3.758
diabetes_test <- t.test(bp_hat ~ cc_diabetes, data = subset_nhanes)
diabetes_test

    Welch Two Sample t-test

data:  bp_hat by cc_diabetes
t = -7.6016, df = 110, p-value = 1.047e-11
alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
95 percent confidence interval:
 -11.246486  -6.595121
sample estimates:
 mean in group No mean in group Yes 
         122.6628          131.5836 
print(paste("Correlation between BP_noise and BP_hat:", cor(subset_nhanes$bp_sys_mean_noise_low, subset_nhanes$bp_hat)))
[1] "Correlation between BP_noise and BP_hat: 0.626950413178945"

Fix Standard Errors:

#warning = False
set.seed(43)

# One bootstrap replicate of the covariate-adjusted regression-calibration
# pipeline: refit the calibration lm on the resampled rows, re-predict
# bp_hat, then refit the outcome glm.
#
# BUG FIX: the outcome model must use the SAME formula as final.model
# (bp_hat plus all covariates). The original fitted cc_diabetes ~ bp_hat
# only, so my.boot$t had just 2 columns while coef(final.model) has 12;
# the division coef(final.model)/bsSD then silently recycled bsSD,
# producing the meaningless t statistics seen below (e.g. 92.47 for a
# BMI dummy).
bootstrap.functionV2 <- function(dat, inds) {
    subset_nhanes.boot <- dat[inds, ]
    rcfit.boot <- lm(bp_sys_mean ~ bp_sys_mean_noise_low + demo_age_years +
                       demo_gender + demo_race + cc_bmi + bp_med_use,
                     data = subset_nhanes.boot)
    subset_nhanes.boot$bp_hat <- predict(rcfit.boot,
                                         newdata = subset_nhanes.boot)
    final.model.boot <- glm(cc_diabetes ~ bp_hat + demo_age_years +
                              demo_gender + demo_race + cc_bmi + bp_med_use,
                            family = "binomial", data = subset_nhanes.boot)
    return(coef(final.model.boot))
}

my.boot<-boot(subset_nhanes, bootstrap.functionV2, R=500)
bsSD <- apply(my.boot$t,2,sd)
bsSD
[1] 1.70784464 0.01331138
t.stat<-coef(final.model)/bsSD
t.stat
                (Intercept)                      bp_hat 
                -4.50498173                  1.11829331 
             demo_age_years            demo_genderWomen 
                 0.02447299                -42.63630869 
demo_raceNon-Hispanic Black demo_raceNon-Hispanic Asian 
                 0.49869177                 88.59155106 
          demo_raceHispanic              demo_raceOther 
                 0.75983012                 59.68157177 
            cc_bmi25 to <30             cc_bmi30 to <35 
                 0.38614475                 92.46793953 
                  cc_bmi35+               bp_med_useYes 
                 1.09764009                 51.63115902 
p.value.bp <- 2 * (1 - pnorm(t.stat['bp_hat']))
print(p.value.bp)
   bp_hat 
0.2634417 
conf.low <- 1.015 - 1.96*0.01331138
conf.high <- 1.015 + 1.96*0.01331138
conf.low
[1] 0.9889097
conf.high
[1] 1.04109