library(dplyr)
library(tidyverse)
library(tibble)
library(ggplot2)
library(data.table)
library(DT)
library(broom)
library(devtools)
# library(viridis)
library(knitr)
# library(hrbrthemes)
library(car)
#using data specified in this github repository:
install_github("jhs-hwg/cardioStatsUSA")
library(cardioStatsUSA)
3 Modeling Diabetes Occurrence
3.1 Simple Diabetes Modeling
We choose diabetes as the comorbidity for modeling. We will try to predict the occurrence of diabetes as a function of blood pressure as well as other covariates.
The most basic model possible is a simple logistic regression model predicting diabetes status based on blood pressure:
#to prevent errors, exclude the rows with na:
used_vars = c('cc_diabetes', 'bp_sys_mean', 'demo_age_years', 'demo_race', 'demo_gender', 'cc_bmi', 'cc_smoke', 'bp_med_use')
clean_nhanes <- nhanes_data[complete.cases(nhanes_data[, ..used_vars]), ]

simple_diab_model_sys <- glm(cc_diabetes ~ bp_sys_mean, data = clean_nhanes, family = binomial)
summary(simple_diab_model_sys)
Call:
glm(formula = cc_diabetes ~ bp_sys_mean, family = binomial, data = clean_nhanes)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.7658807 0.0796600 -59.83 <2e-16 ***
bp_sys_mean 0.0227452 0.0006007 37.87 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 41278 on 52196 degrees of freedom
Residual deviance: 39888 on 52195 degrees of freedom
AIC: 39892
Number of Fisher Scoring iterations: 4
knitr::kable(tidy(simple_diab_model_sys, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 0.009 | 0.080 | -59.828 | 0 | 0.007 | 0.010 |
bp_sys_mean | 1.023 | 0.001 | 37.866 | 0 | 1.022 | 1.024 |
simple_diab_model_dia <- glm(cc_diabetes ~ bp_dia_mean, data = clean_nhanes, family = binomial)
summary(simple_diab_model_dia)
Call:
glm(formula = cc_diabetes ~ bp_dia_mean, family = binomial, data = clean_nhanes)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.406666 0.076384 -18.416 < 2e-16 ***
bp_dia_mean -0.006488 0.001074 -6.039 1.55e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 41006 on 51976 degrees of freedom
Residual deviance: 40969 on 51975 degrees of freedom
(220 observations deleted due to missingness)
AIC: 40973
Number of Fisher Scoring iterations: 4
knitr::kable(tidy(simple_diab_model_dia, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 0.245 | 0.076 | -18.416 | 0 | 0.211 | 0.284 |
bp_dia_mean | 0.994 | 0.001 | -6.039 | 0 | 0.991 | 0.996 |
3.1.0.1 Model Explanation:
For the logistic regression using systolic blood pressure, the estimated odds ratio for bp_sys_mean is 1.023. This implies that higher systolic blood pressure corresponds to higher odds of observing diabetes.
A more exact interpretation: for each additional unit increase in systolic blood pressure, the estimated odds of having diabetes increase by about 2.3%.
This effect may not seem large, yet its p-value is so small that R reports it as 0. The 95% confidence interval for the odds ratio runs from 1.022 to 1.024; intervals constructed this way capture the true parameter 95% of the time, so the true odds ratio is very likely within this range, and the interval clearly excludes 1.
The opposite is true for diastolic blood pressure: the odds ratio is 0.994, implying that higher diastolic blood pressure corresponds to lower odds of observing diabetes. For each additional unit increase in diastolic blood pressure, the estimated odds of having diabetes decrease by about 0.6%.
Note: systolic blood pressure is a better predictor of cardiovascular disease, so I chose to consider systolic rather than diastolic in future models.
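As a quick check, the odds ratios and confidence intervals shown by tidy() above can be reproduced by exponentiating the model's coefficients directly; a minimal sketch using the systolic model fit above:
# Odds ratios are the exponentiated log-odds coefficients
exp(coef(simple_diab_model_sys))
# Confidence intervals on the odds-ratio scale (these should match the
# conf.low / conf.high columns from tidy(..., conf.int = TRUE) above)
exp(confint(simple_diab_model_sys))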
3.2 Demographic Model
Next, we build a model that also includes the demographic variables. This will hopefully produce a more accurate model, since it has more information available when making predictions.
demo_diab_model_sys <- glm(cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender, data = clean_nhanes, family = binomial)
summary(demo_diab_model_sys)
Call:
glm(formula = cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race +
demo_gender, family = binomial, data = clean_nhanes)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.6473577 0.0915063 -61.716 < 2e-16 ***
bp_sys_mean 0.0040216 0.0006986 5.756 8.60e-09 ***
demo_age_years 0.0516294 0.0009596 53.804 < 2e-16 ***
demo_raceNon-Hispanic Black 0.9071035 0.0357034 25.407 < 2e-16 ***
demo_raceNon-Hispanic Asian 0.7971431 0.0615302 12.955 < 2e-16 ***
demo_raceHispanic 0.8596099 0.0349608 24.588 < 2e-16 ***
demo_raceOther 0.7732236 0.0727607 10.627 < 2e-16 ***
demo_genderWomen -0.1472718 0.0270704 -5.440 5.32e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 41278 on 52196 degrees of freedom
Residual deviance: 36211 on 52189 degrees of freedom
AIC: 36227
Number of Fisher Scoring iterations: 5
knitr::kable(tidy(demo_diab_model_sys, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 0.004 | 0.092 | -61.716 | 0 | 0.003 | 0.004 |
bp_sys_mean | 1.004 | 0.001 | 5.756 | 0 | 1.003 | 1.005 |
demo_age_years | 1.053 | 0.001 | 53.804 | 0 | 1.051 | 1.055 |
demo_raceNon-Hispanic Black | 2.477 | 0.036 | 25.407 | 0 | 2.310 | 2.657 |
demo_raceNon-Hispanic Asian | 2.219 | 0.062 | 12.955 | 0 | 1.965 | 2.501 |
demo_raceHispanic | 2.362 | 0.035 | 24.588 | 0 | 2.206 | 2.530 |
demo_raceOther | 2.167 | 0.073 | 10.627 | 0 | 1.876 | 2.495 |
demo_genderWomen | 0.863 | 0.027 | -5.440 | 0 | 0.818 | 0.910 |
3.2.0.1 Model Explanation
In this model, we still see a slight upward trend with systolic blood pressure, although it is weaker than in the simple model (the odds ratio drops from 1.023 to 1.004). With this parameter, the estimated odds of having diabetes increase by only about 0.4% per unit. Although the effect is smaller, the confidence interval runs from 1.003 to 1.005, which does not include 1, so the positive association, while small, is still statistically significant.
3.2.0.2 Demographics
Age:
The model also tells us that age is positively associated with diabetes. With an odds ratio of 1.053, each additional year of age increases the odds of diabetes occurrence by about 5.3%.
Race:
For a categorical variable, each odds ratio compares one level to the baseline “reference group”. Here, the reference group for the race variable is Non-Hispanic White. In comparison to this group, all other groups have significantly higher odds of diabetes: for example, the odds of diabetes in the Hispanic group are about 2.36 times those in the Non-Hispanic White group.
Gender:
For this variable, the reference group is men. Women have lower odds of having diabetes than men: the estimated odds of diabetes in women are about 86% of those in men.
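The reference levels above are simply the first level of each factor. If a different baseline is preferred, the factor can be releveled and the model refit; a sketch (working on a copy so the later models are unaffected, and assuming demo_race contains the level names shown above):
# Refit the demographic model with Hispanic as the reference race group
nhanes_releveled <- as.data.frame(clean_nhanes)
nhanes_releveled$demo_race <- relevel(factor(nhanes_releveled$demo_race), ref = "Hispanic")
demo_diab_model_relevel <- glm(cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender,
    data = nhanes_releveled, family = binomial)
# Race coefficients are now log odds ratios relative to the Hispanic group
summary(demo_diab_model_relevel)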
3.3 Full Model
Lastly, we want to create a model that also includes potential confounding variables. To do this, we must first determine which other variables are reasonable candidates, meaning they show significant associations with diabetes.
We can check this with a chi-squared test, which compares the observed joint frequencies of diabetes and each candidate variable against the frequencies that would be expected if the two were independent. If the chi-squared test is significant, diabetes and the variable tend to occur together, so the variable is worth including as a potential confounder.
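To make this concrete, chisq.test() keeps both the observed counts and the counts expected under independence, so the two can be compared side by side; a minimal sketch using diabetes status and BMI category:
# Cross-tabulate diabetes status against BMI category (NAs are dropped by default)
bmi_tab <- table(nhanes_data$cc_diabetes, nhanes_data$cc_bmi)
bmi_test <- chisq.test(bmi_tab)
bmi_test$observed # observed counts
bmi_test$expected # counts expected if diabetes and BMI were independent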
3.3.0.1 Perform Chi Squared Tests:
#Testing for BMI
nhanes_bmi_diab <- nhanes_data %>% select(cc_diabetes, cc_bmi) %>% drop_na(cc_bmi) %>% drop_na(cc_diabetes)
nhanes_bmi_diab <- table(nhanes_bmi_diab)
#nhanes_bmi_diab
chisq.test(nhanes_bmi_diab)
Pearson's Chi-squared test
data: nhanes_bmi_diab
X-squared = 2368.3, df = 3, p-value < 2.2e-16
#Testing for Smoking
nhanes_smoke_diab <- nhanes_data %>% select(cc_diabetes, cc_smoke) %>% drop_na(cc_smoke) %>% drop_na(cc_diabetes)
nhanes_smoke_diab <- table(nhanes_smoke_diab)
#nhanes_smoke_diab
chisq.test(nhanes_smoke_diab)
Pearson's Chi-squared test
data: nhanes_smoke_diab
X-squared = 435.81, df = 2, p-value < 2.2e-16
#Testing for Hypertensive Medication use
nhanes_med_diab <- nhanes_data %>% select(cc_diabetes, bp_med_use) %>% drop_na(bp_med_use) %>% drop_na(cc_diabetes)
nhanes_med_diab <- table(nhanes_med_diab)
#nhanes_med_diab
chisq.test(nhanes_med_diab)
Pearson's Chi-squared test with Yates' continuity correction
data: nhanes_med_diab
X-squared = 5807.1, df = 1, p-value < 2.2e-16
After this analysis, we can see that all three variables (BMI, smoking status, and medication use) are significantly associated with diabetes. We therefore include all of them in the next model, in addition to systolic blood pressure and the demographic variables:
full_diab_model_sys <- glm(cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender + cc_bmi + cc_smoke + bp_med_use, data = clean_nhanes, family = binomial)
summary(full_diab_model_sys)
Call:
glm(formula = cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race +
demo_gender + cc_bmi + cc_smoke + bp_med_use, family = binomial,
data = clean_nhanes)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.3003463 0.1112295 -56.643 < 2e-16 ***
bp_sys_mean 0.0017066 0.0007373 2.315 0.0206 *
demo_age_years 0.0470354 0.0011312 41.579 < 2e-16 ***
demo_raceNon-Hispanic Black 0.6987360 0.0374745 18.646 < 2e-16 ***
demo_raceNon-Hispanic Asian 1.3022903 0.0660355 19.721 < 2e-16 ***
demo_raceHispanic 0.8889260 0.0366792 24.235 < 2e-16 ***
demo_raceOther 0.8913458 0.0763522 11.674 < 2e-16 ***
demo_genderWomen -0.3082492 0.0295124 -10.445 < 2e-16 ***
cc_bmi25 to <30 0.5927344 0.0444524 13.334 < 2e-16 ***
cc_bmi30 to <35 1.1736075 0.0463565 25.317 < 2e-16 ***
cc_bmi35+ 1.7702868 0.0480051 36.877 < 2e-16 ***
cc_smokeFormer 0.0614704 0.0335729 1.831 0.0671 .
cc_smokeCurrent 0.1853436 0.0403687 4.591 4.41e-06 ***
bp_med_useYes 0.9121351 0.0314064 29.043 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 41278 on 52196 degrees of freedom
Residual deviance: 33055 on 52183 degrees of freedom
AIC: 33083
Number of Fisher Scoring iterations: 6
knitr::kable(tidy(full_diab_model_sys, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 0.002 | 0.111 | -56.643 | 0.000 | 0.001 | 0.002 |
bp_sys_mean | 1.002 | 0.001 | 2.315 | 0.021 | 1.000 | 1.003 |
demo_age_years | 1.048 | 0.001 | 41.579 | 0.000 | 1.046 | 1.050 |
demo_raceNon-Hispanic Black | 2.011 | 0.037 | 18.646 | 0.000 | 1.869 | 2.164 |
demo_raceNon-Hispanic Asian | 3.678 | 0.066 | 19.721 | 0.000 | 3.229 | 4.183 |
demo_raceHispanic | 2.433 | 0.037 | 24.235 | 0.000 | 2.264 | 2.614 |
demo_raceOther | 2.438 | 0.076 | 11.674 | 0.000 | 2.096 | 2.828 |
demo_genderWomen | 0.735 | 0.030 | -10.445 | 0.000 | 0.693 | 0.778 |
cc_bmi25 to <30 | 1.809 | 0.044 | 13.334 | 0.000 | 1.659 | 1.974 |
cc_bmi30 to <35 | 3.234 | 0.046 | 25.317 | 0.000 | 2.954 | 3.542 |
cc_bmi35+ | 5.873 | 0.048 | 36.877 | 0.000 | 5.347 | 6.454 |
cc_smokeFormer | 1.063 | 0.034 | 1.831 | 0.067 | 0.996 | 1.136 |
cc_smokeCurrent | 1.204 | 0.040 | 4.591 | 0.000 | 1.112 | 1.302 |
bp_med_useYes | 2.490 | 0.031 | 29.043 | 0.000 | 2.341 | 2.648 |
3.3.0.2 Model Explanation
The variables from the previous models continue to show the same patterns, although the effect of systolic blood pressure is reduced yet again (the odds ratio falls to 1.002). Its p-value is now much higher, at about 0.02, which is still significant but much less strongly so. The demographic variables show much the same pattern, although the Non-Hispanic Asian group now has a noticeably higher odds ratio than in the previous model.
3.3.0.3 Covariates
BMI:
For this variable, the baseline reference level is BMI <25. In comparison, those with higher BMI have higher odds of having diabetes, and the odds rise as BMI increases: those with BMI from 25 to <30 have about 1.8 times the odds, while those with BMI 35+ have nearly 5.9 times the odds.
Smoking:
With never-smokers as the baseline, former smokers may have about 6% higher odds of diabetes. This effect may not be real, however, given that the p-value is 0.067 and the confidence interval includes an odds ratio of 1. Current smokers, on the other hand, have about 20% higher odds, this time with a much smaller p-value of roughly 4e-06.
Blood Pressure Medication Use:
Compared to those who do not use blood pressure medication, individuals who do have about 2.5 times the odds of having diabetes.
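Since the car package is loaded above, one optional diagnostic before moving on to model selection is a multicollinearity check on the full model using generalized variance inflation factors; a sketch (values near 1 suggest the predictors are not problematically correlated):
# Generalized VIFs; GVIF^(1/(2*Df)) is the value comparable across predictors
car::vif(full_diab_model_sys)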
3.4 Model Selection
It is possible to find an even better model by modifying which variables we include in our logistic regression model. We can achieve this with stepwise model selection, which adds or removes explanatory variables and evaluates each candidate model, keeping the set of variables that performs best.
The criterion used by step() is the Akaike Information Criterion (AIC), defined as 2k minus twice the maximized log-likelihood, where k is the number of estimated parameters. The AIC rewards fit but penalizes complexity, and lower values are better, so a variable is added or removed only when doing so lowers the AIC.
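The AIC values reported in the summaries above can be reproduced from the log-likelihood by hand; a minimal sketch using the full model fit earlier:
# AIC = 2k - 2 * log-likelihood, with k the number of estimated parameters
k <- length(coef(full_diab_model_sys))
manual_aic <- 2 * k - 2 * as.numeric(logLik(full_diab_model_sys))
manual_aic
AIC(full_diab_model_sys) # should agree with the manual value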
3.4.0.1 Backwards Selection:
backwards = step(full_diab_model_sys)
Start: AIC=33082.55
cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender +
cc_bmi + cc_smoke + bp_med_use
Df Deviance AIC
<none> 33055 33083
- bp_sys_mean 1 33060 33086
- cc_smoke 2 33076 33100
- demo_gender 1 33164 33190
- bp_med_use 1 33909 33935
- demo_race 4 33930 33950
- cc_bmi 3 34756 34778
- demo_age_years 1 34931 34957
formula(backwards)
cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender +
cc_bmi + cc_smoke + bp_med_use
It looks like backwards selection returns the full model unchanged. In other words, removing any of the variables does not lower the AIC, and therefore does not improve the model.
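A related design choice is the penalty used during selection: step() defaults to the AIC penalty of 2 per parameter, but passing k = log(n) applies the stricter BIC penalty, which tends to favor smaller models. A sketch, in case a more parsimonious model is preferred:
# Backwards selection with the BIC penalty instead of the default AIC penalty
n <- nrow(clean_nhanes)
backwards_bic <- step(full_diab_model_sys, k = log(n), trace = 0)
formula(backwards_bic)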
3.4.0.2 Forward Selection
#start from a model without any predictors
nothing <- glm(cc_diabetes ~ 1, data = clean_nhanes, family = binomial)
summary(nothing)
Call:
glm(formula = cc_diabetes ~ 1, family = binomial, data = clean_nhanes)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.85919 0.01282 -145.1 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 41278 on 52196 degrees of freedom
Residual deviance: 41278 on 52196 degrees of freedom
AIC: 41280
Number of Fisher Scoring iterations: 4
forwards = step(nothing,
    scope = list(lower = formula(nothing), upper = formula(full_diab_model_sys)), direction = "forward")
Start: AIC=41279.9
cc_diabetes ~ 1
Df Deviance AIC
+ bp_med_use 1 37076 37080
+ demo_age_years 1 37252 37256
+ cc_bmi 3 39218 39226
+ bp_sys_mean 1 39888 39892
+ demo_race 4 40891 40901
+ cc_smoke 2 40911 40917
+ demo_gender 1 41242 41246
<none> 41278 41280
Step: AIC=37079.85
cc_diabetes ~ bp_med_use
Df Deviance AIC
+ demo_age_years 1 35675 35681
+ cc_bmi 3 35907 35917
+ demo_race 4 36597 36609
+ bp_sys_mean 1 36788 36794
+ cc_smoke 2 36968 36976
+ demo_gender 1 37009 37015
<none> 37076 37080
Step: AIC=35681.48
cc_diabetes ~ bp_med_use + demo_age_years
Df Deviance AIC
+ cc_bmi 3 34070 34082
+ demo_race 4 34834 34848
+ demo_gender 1 35624 35632
+ bp_sys_mean 1 35654 35662
+ cc_smoke 2 35670 35680
<none> 35675 35681
Step: AIC=34082.43
cc_diabetes ~ bp_med_use + demo_age_years + cc_bmi
Df Deviance AIC
+ demo_race 4 33213 33233
+ demo_gender 1 33963 33977
+ bp_sys_mean 1 34050 34064
+ cc_smoke 2 34054 34070
<none> 34070 34082
Step: AIC=33232.52
cc_diabetes ~ bp_med_use + demo_age_years + cc_bmi + demo_race
Df Deviance AIC
+ demo_gender 1 33081 33103
+ cc_smoke 2 33170 33194
+ bp_sys_mean 1 33208 33230
<none> 33213 33233
Step: AIC=33102.89
cc_diabetes ~ bp_med_use + demo_age_years + cc_bmi + demo_race +
demo_gender
Df Deviance AIC
+ cc_smoke 2 33060 33086
+ bp_sys_mean 1 33076 33100
<none> 33081 33103
Step: AIC=33085.88
cc_diabetes ~ bp_med_use + demo_age_years + cc_bmi + demo_race +
demo_gender + cc_smoke
Df Deviance AIC
+ bp_sys_mean 1 33055 33083
<none> 33060 33086
Step: AIC=33082.55
cc_diabetes ~ bp_med_use + demo_age_years + cc_bmi + demo_race +
demo_gender + cc_smoke + bp_sys_mean
The Forward Selection process again selects the full model.
3.4.1 Forward-Backward Selection
bothways = step(nothing, list(lower = formula(nothing), upper = formula(full_diab_model_sys)),
    direction = "both", trace = 0)
formula(bothways)
cc_diabetes ~ bp_med_use + demo_age_years + cc_bmi + demo_race +
demo_gender + cc_smoke + bp_sys_mean
The Forward-Backward algorithm also selects all of the variables.
In this case, the best model (as determined by the AIC) includes all of the variables (systolic blood pressure, age, race, sex, smoking status, BMI, and medication use). In other situations, it may have returned just a few of these variables, but for this example, we can proceed with the full model.
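With the selected model in hand, predicted probabilities of diabetes can be generated for new covariate profiles; a minimal sketch with a purely hypothetical individual:
# Hypothetical profile: 55-year-old Hispanic woman, systolic BP 130,
# BMI 30 to <35, never smoked, not on blood pressure medication
new_person <- data.frame(bp_sys_mean = 130, demo_age_years = 55, demo_race = "Hispanic",
    demo_gender = "Women", cc_bmi = "30 to <35", cc_smoke = "Never", bp_med_use = "No")
# type = "response" returns the predicted probability rather than the log-odds
predict(full_diab_model_sys, newdata = new_person, type = "response")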
4 Modeling for Subset of Data
#Read in the subset data
<- readRDS("nhanes_subset.rds") subset_nhanes
4.1 Simple Diabetes Modeling
As before, we choose diabetes as the comorbidity for modeling, and will try to predict its occurrence as a function of blood pressure as well as other covariates.
The most basic model possible is a simple logistic regression model predicting diabetes status based on blood pressure:
#to prevent errors, exclude the rows with na:
used_vars = c('cc_diabetes', 'bp_sys_mean', 'demo_age_years', 'demo_race', 'demo_gender', 'cc_bmi', 'cc_smoke', 'bp_med_use')
clean_subset <- subset_nhanes[, ..used_vars]
head(clean_subset)
cc_diabetes | bp_sys_mean | demo_age_years | demo_race | demo_gender | cc_bmi | cc_smoke | bp_med_use |
---|---|---|---|---|---|---|---|
Yes | 134.0000 | 77 | Non-Hispanic White | Men | 35+ | Never | Yes |
Yes | 171.3333 | 63 | Hispanic | Women | <25 | Former | Yes |
Yes | 133.1667 | 61 | Non-Hispanic Black | Women | 35+ | Never | Yes |
Yes | 136.6667 | 40 | Hispanic | Women | 35+ | Never | Yes |
Yes | 177.3333 | 76 | Hispanic | Women | 25 to <30 | Former | Yes |
Yes | 132.0000 | 68 | Hispanic | Women | 35+ | Never | Yes |
simple_diab_model_subset <- glm(cc_diabetes ~ bp_sys_mean, data = clean_subset, family = binomial)
summary(simple_diab_model_subset)
Call:
glm(formula = cc_diabetes ~ bp_sys_mean, family = binomial, data = clean_subset)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.777250 0.787205 -7.339 2.15e-13 ***
bp_sys_mean 0.030330 0.005911 5.131 2.89e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 463.66 on 599 degrees of freedom
Residual deviance: 437.53 on 598 degrees of freedom
AIC: 441.53
Number of Fisher Scoring iterations: 5
knitr::kable(tidy(simple_diab_model_subset, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 0.003 | 0.787 | -7.339 | 0 | 0.001 | 0.014 |
bp_sys_mean | 1.031 | 0.006 | 5.131 | 0 | 1.019 | 1.043 |
#simple_diab_model_dia <- glm(cc_diabetes ~ bp_dia_mean, data = clean_nhanes, family = binomial)
#summary(simple_diab_model_dia)
#knitr::kable(tidy(simple_diab_model_dia, exponentiate = TRUE, conf.int = TRUE), digits = 3)
4.1.0.1 Model Explanation:
For the logistic regression using systolic blood pressure on the subset, the estimated odds ratio is 1.031. As in the full data set, this implies that higher systolic blood pressure corresponds to higher odds of observing diabetes.
A more exact interpretation: for each additional unit increase in systolic blood pressure, the estimated odds of having diabetes increase by about 3.1%.
The p-value (about 2.9e-07) is larger than in the full data set but still highly significant, and the 95% confidence interval for the odds ratio, 1.019 to 1.043, does not include 1, so the positive association holds in the subset as well.
With only 600 observations, the estimate is less precise than before (the confidence interval is noticeably wider), which is expected for a sample of this size.
Note: systolic blood pressure is a better predictor of cardiovascular disease, so I chose to consider systolic rather than diastolic in future models.
4.2 Demographic Model
Next, we again build a model that also includes the demographic variables, which will hopefully produce a more accurate model for the subset.
demo_diab_model_subset <- glm(cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender, data = clean_subset, family = binomial)
summary(demo_diab_model_subset)
Call:
glm(formula = cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race +
demo_gender, family = binomial, data = clean_subset)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.542316 0.898829 -7.279 3.37e-13 ***
bp_sys_mean 0.014810 0.006876 2.154 0.03124 *
demo_age_years 0.042030 0.009080 4.629 3.67e-06 ***
demo_raceNon-Hispanic Black 1.060715 0.345794 3.067 0.00216 **
demo_raceNon-Hispanic Asian 0.333315 0.675457 0.493 0.62168
demo_raceHispanic 1.216336 0.329225 3.695 0.00022 ***
demo_raceOther 0.267046 0.803168 0.332 0.73952
demo_genderWomen -0.383040 0.262182 -1.461 0.14402
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 463.66 on 599 degrees of freedom
Residual deviance: 396.40 on 592 degrees of freedom
AIC: 412.4
Number of Fisher Scoring iterations: 5
knitr::kable(tidy(demo_diab_model_subset, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 0.001 | 0.899 | -7.279 | 0.000 | 0.000 | 0.008 |
bp_sys_mean | 1.015 | 0.007 | 2.154 | 0.031 | 1.001 | 1.029 |
demo_age_years | 1.043 | 0.009 | 4.629 | 0.000 | 1.025 | 1.062 |
demo_raceNon-Hispanic Black | 2.888 | 0.346 | 3.067 | 0.002 | 1.470 | 5.741 |
demo_raceNon-Hispanic Asian | 1.396 | 0.675 | 0.493 | 0.622 | 0.304 | 4.681 |
demo_raceHispanic | 3.375 | 0.329 | 3.695 | 0.000 | 1.785 | 6.527 |
demo_raceOther | 1.306 | 0.803 | 0.332 | 0.740 | 0.193 | 5.255 |
demo_genderWomen | 0.682 | 0.262 | -1.461 | 0.144 | 0.406 | 1.138 |
4.2.0.1 Model Explanation
In this model, we still see an upward trend with systolic blood pressure, although it is weaker than in the simple subset model (the odds ratio drops from 1.031 to 1.015). With this parameter, the estimated odds of having diabetes increase by about 1.5% per unit. The confidence interval, 1.001 to 1.029, only just excludes 1, and the p-value is 0.031, so the positive association is still significant in the subset, though only marginally.
4.2.0.2 Demographics
Age:
The model again tells us that age is positively associated with diabetes. With an odds ratio of 1.043, each additional year of age increases the odds of diabetes occurrence by about 4.3%.
Race:
As before, the odds ratios for this categorical variable compare each group to the Non-Hispanic White reference group. In the subset, the Non-Hispanic Black group (odds ratio 2.89) and the Hispanic group (odds ratio 3.38) have significantly higher odds of diabetes, while the estimates for the Non-Hispanic Asian and Other groups are not statistically significant, likely because these groups contain few observations in a sample of 600.
Gender:
The reference group is men. The estimated odds of diabetes in women are about 68% of those in men, but with a p-value of 0.144 and a confidence interval that includes 1, this difference is not statistically significant in the subset.
4.3 Full Model
Lastly, we again want to create a model that includes potential confounding variables, first checking which other variables show significant associations with diabetes in the subset.
As before, we use a chi-squared test, which compares the observed joint frequencies of diabetes and each candidate variable against the frequencies expected if the two were independent; a significant result indicates the variable tends to occur together with diabetes and is worth including.
4.3.0.1 Perform Chi Squared Tests:
#Testing for BMI
nhanes_bmi_diab_subset <- clean_subset %>% select(cc_diabetes, cc_bmi) %>% drop_na(cc_bmi) %>% drop_na(cc_diabetes)
nhanes_bmi_diab_subset <- table(nhanes_bmi_diab_subset)
#nhanes_bmi_diab
chisq.test(nhanes_bmi_diab_subset)
Pearson's Chi-squared test
data: nhanes_bmi_diab_subset
X-squared = 17.898, df = 3, p-value = 0.0004616
#Testing for Smoking
nhanes_smoke_diab_subset <- clean_subset %>% select(cc_diabetes, cc_smoke) %>% drop_na(cc_smoke) %>% drop_na(cc_diabetes)
nhanes_smoke_diab_subset <- table(nhanes_smoke_diab_subset)
#nhanes_smoke_diab
chisq.test(nhanes_smoke_diab_subset)
Pearson's Chi-squared test
data: nhanes_smoke_diab_subset
X-squared = 6.2815, df = 2, p-value = 0.04325
#Testing for Hypertensive Medication use
nhanes_med_diab_subset <- clean_subset %>% select(cc_diabetes, bp_med_use) %>% drop_na(bp_med_use) %>% drop_na(cc_diabetes)
nhanes_med_diab_subset <- table(nhanes_med_diab_subset)
#nhanes_med_diab
chisq.test(nhanes_med_diab_subset)
Pearson's Chi-squared test with Yates' continuity correction
data: nhanes_med_diab_subset
X-squared = 42.751, df = 1, p-value = 6.217e-11
After this analysis, we can see that all three variables (BMI, smoking status, and medication use) remain significantly associated with diabetes in the subset. We include all of them in the next model, in addition to systolic blood pressure and the demographic variables:
full_diab_model_subset <- glm(cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender + cc_bmi + cc_smoke + bp_med_use, data = clean_subset, family = binomial)
summary(full_diab_model_subset)
Call:
glm(formula = cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race +
demo_gender + cc_bmi + cc_smoke + bp_med_use, family = binomial,
data = clean_subset)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -8.374715 1.157486 -7.235 4.65e-13 ***
bp_sys_mean 0.017214 0.007388 2.330 0.019811 *
demo_age_years 0.042340 0.010993 3.851 0.000117 ***
demo_raceNon-Hispanic Black 0.882174 0.365983 2.410 0.015934 *
demo_raceNon-Hispanic Asian 1.236127 0.725126 1.705 0.088249 .
demo_raceHispanic 1.348695 0.346960 3.887 0.000101 ***
demo_raceOther 0.877061 0.821270 1.068 0.285551
demo_genderWomen -0.514349 0.288090 -1.785 0.074200 .
cc_bmi25 to <30 0.706520 0.420936 1.678 0.093260 .
cc_bmi30 to <35 1.285068 0.447034 2.875 0.004045 **
cc_bmi35+ 1.994383 0.466486 4.275 1.91e-05 ***
cc_smokeFormer 0.435102 0.326010 1.335 0.181998
cc_smokeCurrent 0.522143 0.365577 1.428 0.153213
bp_med_useYes 0.661943 0.308302 2.147 0.031788 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 463.66 on 599 degrees of freedom
Residual deviance: 360.71 on 586 degrees of freedom
AIC: 388.71
Number of Fisher Scoring iterations: 6
knitr::kable(tidy(full_diab_model_subset, exponentiate = TRUE, conf.int = TRUE), digits = 3)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 0.000 | 1.157 | -7.235 | 0.000 | 0.000 | 0.002 |
bp_sys_mean | 1.017 | 0.007 | 2.330 | 0.020 | 1.003 | 1.032 |
demo_age_years | 1.043 | 0.011 | 3.851 | 0.000 | 1.021 | 1.067 |
demo_raceNon-Hispanic Black | 2.416 | 0.366 | 2.410 | 0.016 | 1.181 | 4.993 |
demo_raceNon-Hispanic Asian | 3.442 | 0.725 | 1.705 | 0.088 | 0.697 | 12.982 |
demo_raceHispanic | 3.852 | 0.347 | 3.887 | 0.000 | 1.971 | 7.724 |
demo_raceOther | 2.404 | 0.821 | 1.068 | 0.286 | 0.348 | 10.201 |
demo_genderWomen | 0.598 | 0.288 | -1.785 | 0.074 | 0.338 | 1.048 |
cc_bmi25 to <30 | 2.027 | 0.421 | 1.678 | 0.093 | 0.905 | 4.771 |
cc_bmi30 to <35 | 3.615 | 0.447 | 2.875 | 0.004 | 1.534 | 8.959 |
cc_bmi35+ | 7.348 | 0.466 | 4.275 | 0.000 | 3.031 | 19.065 |
cc_smokeFormer | 1.545 | 0.326 | 1.335 | 0.182 | 0.810 | 2.921 |
cc_smokeCurrent | 1.686 | 0.366 | 1.428 | 0.153 | 0.812 | 3.427 |
bp_med_useYes | 1.939 | 0.308 | 2.147 | 0.032 | 1.062 | 3.566 |
4.3.0.2 Model Explanation
The variables from the previous models continue to show broadly the same patterns. The effect of systolic blood pressure is again modest (odds ratio 1.017), with a p-value of about 0.02, which is still significant but less strongly so than in the simple subset model. The demographic variables follow much the same pattern, although the Non-Hispanic Asian estimate is larger here (odds ratio 3.44) and, with a p-value of 0.088, is no longer statistically significant.
4.3.0.3 Covariates
BMI:
The baseline reference level is BMI <25. Those with higher BMI have higher odds of diabetes, and the odds rise with BMI: in the subset, those with BMI from 30 to <35 have about 3.6 times the odds and those with BMI 35+ about 7.3 times the odds. The estimate for BMI 25 to <30 (odds ratio 2.03) does not reach significance here (p = 0.093).
Smoking:
With never-smokers as the baseline, the point estimates suggest higher odds of diabetes for both former smokers (odds ratio 1.55) and current smokers (odds ratio 1.69), but neither effect is statistically significant in the subset (p-values of 0.18 and 0.15), and both confidence intervals include 1.
Blood Pressure Medication Use:
Compared to those who do not use blood pressure medication, individuals who do have about 1.9 times the odds of having diabetes (p = 0.032).
4.4 Model Selection
It may be possible to find an even better model by modifying which variables we include in our logistic regression model for the subset. As in Section 3, we use stepwise selection, which adds or removes explanatory variables and compares the candidate models by AIC.
4.4.0.1 Backwards Selection:
backwards = step(full_diab_model_subset)
Start: AIC=388.71
cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender +
cc_bmi + cc_smoke + bp_med_use
Df Deviance AIC
- cc_smoke 2 363.63 387.63
<none> 360.71 388.71
- demo_gender 1 363.93 389.93
- bp_med_use 1 365.35 391.35
- bp_sys_mean 1 366.10 392.10
- demo_race 4 377.67 397.67
- demo_age_years 1 376.47 402.47
- cc_bmi 3 383.05 405.05
Step: AIC=387.63
cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender +
cc_bmi + bp_med_use
Df Deviance AIC
<none> 363.63 387.63
- demo_gender 1 368.53 390.53
- bp_med_use 1 368.62 390.62
- bp_sys_mean 1 368.96 390.96
- demo_race 4 379.88 395.88
- demo_age_years 1 380.29 402.29
- cc_bmi 3 385.97 403.97
formula(backwards)
cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender +
cc_bmi + bp_med_use
This time, backwards selection does not return the full model: removing smoking status (cc_smoke) lowers the AIC from 388.71 to 387.63, so the selected model keeps every variable except smoking.
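To confirm this on the subset, the full model and the reduced model chosen by backwards selection can be compared directly; a quick sketch (backwards is the reduced model returned above):
# AIC comparison: full subset model vs. the model without cc_smoke
AIC(full_diab_model_subset, backwards)
# Likelihood-ratio test for the dropped smoking term (the models are nested)
anova(backwards, full_diab_model_subset, test = "Chisq")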
4.4.0.2 Forward Selection
#start from a model without any predictors
nothing <- glm(cc_diabetes ~ 1, data = subset_nhanes, family = binomial)
summary(nothing)
Call:
glm(formula = cc_diabetes ~ 1, family = binomial, data = subset_nhanes)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.9010 0.1214 -15.66 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 463.66 on 599 degrees of freedom
Residual deviance: 463.66 on 599 degrees of freedom
AIC: 465.66
Number of Fisher Scoring iterations: 4
forwards = step(nothing,
    scope = list(lower = formula(nothing), upper = formula(full_diab_model_subset)), direction = "forward")
Start: AIC=465.66
cc_diabetes ~ 1
Df Deviance AIC
+ demo_age_years 1 421.07 425.07
+ bp_med_use 1 423.90 427.90
+ bp_sys_mean 1 437.53 441.53
+ cc_bmi 3 445.01 453.01
+ demo_race 4 447.81 457.81
+ cc_smoke 2 457.72 463.72
+ demo_gender 1 460.78 464.78
<none> 463.66 465.66
Step: AIC=425.07
cc_diabetes ~ demo_age_years
Df Deviance AIC
+ cc_bmi 3 396.85 406.85
+ bp_med_use 1 407.64 413.64
+ demo_race 4 402.84 414.84
+ bp_sys_mean 1 416.24 422.24
<none> 421.07 425.07
+ demo_gender 1 419.20 425.20
+ cc_smoke 2 418.19 426.19
Step: AIC=406.85
cc_diabetes ~ demo_age_years + cc_bmi
Df Deviance AIC
+ demo_race 4 378.39 396.39
+ bp_med_use 1 390.28 402.28
+ bp_sys_mean 1 390.48 402.48
+ demo_gender 1 393.36 405.36
<none> 396.85 406.85
+ cc_smoke 2 393.19 407.19
Step: AIC=396.39
cc_diabetes ~ demo_age_years + cc_bmi + demo_race
Df Deviance AIC
+ bp_sys_mean 1 372.48 392.48
+ bp_med_use 1 373.46 393.46
+ demo_gender 1 375.00 395.00
+ cc_smoke 2 373.62 395.62
<none> 378.39 396.39
Step: AIC=392.48
cc_diabetes ~ demo_age_years + cc_bmi + demo_race + bp_sys_mean
Df Deviance AIC
+ bp_med_use 1 368.53 390.53
+ demo_gender 1 368.62 390.62
+ cc_smoke 2 367.76 391.76
<none> 372.48 392.48
Step: AIC=390.53
cc_diabetes ~ demo_age_years + cc_bmi + demo_race + bp_sys_mean +
bp_med_use
Df Deviance AIC
+ demo_gender 1 363.63 387.63
+ cc_smoke 2 363.93 389.93
<none> 368.53 390.53
Step: AIC=387.63
cc_diabetes ~ demo_age_years + cc_bmi + demo_race + bp_sys_mean +
bp_med_use + demo_gender
Df Deviance AIC
<none> 363.63 387.63
+ cc_smoke 2 360.71 388.71
The forward selection process also stops before adding smoking status, arriving at the same reduced model as backwards selection.
4.4.1 Forward-Backward Selection
bothways = step(nothing, list(lower = formula(nothing), upper = formula(full_diab_model_subset)),
    direction = "both", trace = 0)
formula(bothways)
cc_diabetes ~ demo_age_years + cc_bmi + demo_race + bp_sys_mean +
bp_med_use + demo_gender
The forward-backward algorithm selects the same reduced model, which excludes smoking status.
For this subset, the best model (as determined by the AIC) includes systolic blood pressure, age, race, sex, BMI, and medication use, but not smoking status. Unlike in the full data set, dropping smoking slightly improves the AIC here, so we proceed with this reduced model for the subset.