library(dplyr)
library(tidyverse)
library(gtsummary)
library(tibble)
library(ggplot2)
library(data.table)
library(DT)
library(broom)
library(devtools)
library(viridis)
library(knitr)
library(hrbrthemes)
library(car)
#using data specified in this github repository:
install_github("jhs-hwg/cardioStatsUSA")
library(cardioStatsUSA)
2 Data Visualization
head(nhanes_data)
svy_id | svy_weight_mec | svy_psu | svy_strata | svy_year | svy_subpop_htn | svy_subpop_chol | demo_age_cat | demo_race | demo_race_black | demo_age_years | demo_pregnant | demo_gender | bp_sys_mean | bp_dia_mean | bp_cat_meds_excluded | bp_cat_meds_included | bp_control_jnc7 | bp_control_accaha | bp_control_escesh_1 | bp_control_escesh_2 | bp_control_140_90 | bp_control_130_80 | bp_uncontrolled_jnc7 | bp_uncontrolled_accaha | bp_uncontrolled_escesh_1 | bp_uncontrolled_escesh_2 | bp_uncontrolled_140_90 | bp_uncontrolled_130_80 | bp_med_use | bp_med_recommended_jnc7 | bp_med_recommended_accaha | bp_med_recommended_escesh | bp_med_n_class | bp_med_n_pills | bp_med_combination | bp_med_pills_gteq_2 | bp_med_ace | bp_med_aldo | bp_med_alpha | bp_med_angioten | bp_med_beta | bp_med_central | bp_med_ccb | bp_med_ccb_dh | bp_med_ccb_ndh | bp_med_diur_Ksparing | bp_med_diur_loop | bp_med_diur_thz | bp_med_renin_inhibitors | bp_med_vasod | htn_jnc7 | htn_accaha | htn_escesh | htn_aware | htn_resistant_jnc7 | htn_resistant_accaha | htn_resistant_jnc7_thz | htn_resistant_accaha_thz | chol_measured_never | chol_measured_last | chol_total | chol_total_gteq_200 | chol_total_gteq_240 | chol_hdl | chol_hdl_low | chol_trig | chol_trig_gteq_150 | chol_ldl | chol_ldl_5cat | chol_ldl_lt_70 | chol_ldl_gteq_70 | chol_ldl_lt_100 | chol_ldl_gteq_100 | chol_ldl_gteq_190 | chol_ldl_persistent | chol_nonhdl | chol_nonhdl_5cat | chol_nonhdl_lt_100 | chol_nonhdl_gteq_100 | chol_nonhdl_gteq_220 | chol_med_use | chol_med_use_sr | chol_med_statin | chol_med_ezetimibe | chol_med_pcsk9i | chol_med_bile | chol_med_fibric_acid | chol_med_atorvastatin | chol_med_simvastatin | chol_med_rosuvastatin | chol_med_pravastatin | chol_med_pitavastatin | chol_med_fluvastatin | chol_med_lovastatin | chol_med_other | chol_med_addon_use | chol_med_addon_recommended_ahaacc | chol_med_statin_recommended_ahaacc | chol_med_recommended_ever | ascvd_risk_vh_ahaacc | cc_smoke | cc_bmi | cc_diabetes | cc_ckd | cc_acr | cc_egfr | cc_hba1c | cc_egfr_lt60 | cc_acr_gteq30 | cc_cvd_mi | cc_cvd_chd | cc_cvd_stroke | cc_cvd_ascvd | cc_cvd_hf | cc_cvd_any |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
12 | 95494.214 | 2 | 6 | 1999-2000 | 1 | 1 | 18 to 44 | Non-Hispanic White | No | 37 | No | Men | 176.6667 | 102.00000 | SBP 160+ or DBP 100+ mm Hg | taking antihypertensive medications | No | No | No | No | No | No | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | One | One | No | No | Yes | No | No | No | No | No | No | No | No | No | No | No | No | No | Yes | Yes | Yes | Yes | No | No | No | No | Cholesterol has been measured previously | 1 to 5 years ago | 156 | No | No | 38 | Yes | 146 | No | 90.87551 | 70 to <100 mg/dL | No | Yes | Yes | No | No | No | 118 | 100 to <130 mg/dL | No | Yes | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Never | 30 to <35 | No | Yes | 34.646465 | 83.19409 | 5.2 | No | Yes | No | No | No | No | No | No |
21 | 2910.630 | 2 | 3 | 1999-2000 | 1 | 1 | 18 to 44 | Hispanic | No | 18 | No | Men | 121.3333 | 80.00000 | SBP of 130 to <140 or DBP 80 to <90 mm Hg | SBP of 130 to <140 or DBP 80 to <90 mm Hg | Yes | No | Yes | No | Yes | No | No | Yes | No | Yes | No | Yes | No | No | No | No | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Yes | No | No | No | No | No | No | Cholesterol has been measured previously | NA | 161 | No | No | 34 | Yes | 120 | No | 104.23559 | 100 to <130 mg/dL | No | Yes | No | Yes | No | No | 127 | 100 to <130 mg/dL | No | Yes | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | NA | 35+ | No | No | 1.600000 | 117.88244 | 5.0 | No | No | NA | NA | NA | No | NA | No |
27 | 1935.098 | 2 | 13 | 1999-2000 | 1 | 0 | 18 to 44 | Hispanic | No | 18 | No | Men | 118.0000 | 78.00000 | SBP <120 and DBP <80 mm Hg | SBP <120 and DBP <80 mm Hg | Yes | Yes | Yes | Yes | Yes | Yes | No | No | No | No | No | No | No | No | No | No | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 25 to <30 | No | No | 4.800000 | NA | NA | NA | No | NA | NA | NA | No | NA | No |
28 | 41107.191 | 2 | 1 | 1999-2000 | 1 | 1 | 18 to 44 | Non-Hispanic White | No | 18 | No | Men | 94.0000 | 42.66667 | SBP <120 and DBP <80 mm Hg | SBP <120 and DBP <80 mm Hg | Yes | Yes | Yes | Yes | Yes | Yes | No | No | No | No | No | No | No | No | No | No | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Cholesterol has been measured previously | NA | 151 | No | No | 64 | No | 52 | No | 75.74248 | 70 to <100 mg/dL | No | Yes | Yes | No | No | No | 87 | < 100 mg/dL | Yes | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | NA | <25 | No | No | 13.127572 | 139.84511 | 4.8 | No | No | NA | NA | NA | No | NA | No |
56 | 89768.526 | 1 | 12 | 1999-2000 | 1 | 0 | 18 to 44 | Non-Hispanic White | No | 21 | No | Men | 121.0000 | 65.00000 | SBP of 120 to <130 and DBP <80 mm Hg | SBP of 120 to <130 and DBP < 80 mm Hg | Yes | Yes | Yes | Yes | Yes | Yes | No | No | No | No | No | No | No | No | No | No | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Former | <25 | No | No | 2.715232 | 126.51034 | 5.5 | No | No | No | No | No | No | No | No |
57 | 102609.750 | 2 | 5 | 1999-2000 | 1 | 1 | 18 to 44 | Non-Hispanic White | No | 39 | No | Men | 119.3333 | 85.33333 | SBP of 130 to <140 or DBP 80 to <90 mm Hg | SBP of 130 to <140 or DBP 80 to <90 mm Hg | Yes | No | Yes | No | Yes | No | No | Yes | No | Yes | No | Yes | No | No | No | No | None | None | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Yes | No | Yes | No | No | No | No | Cholesterol has been measured previously | >5 years ago (possibly never) | 243 | Yes | Yes | 46 | Yes | 97 | No | 179.25406 | 130 to <190 mg/dL | No | Yes | No | Yes | No | No | 197 | 160 to <220 mg/dL | No | Yes | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | No | Never | 25 to <30 | No | No | 1.028807 | 91.68801 | 4.9 | No | No | No | No | No | No | No | No |
print(nrow(nhanes_data))
[1] 59799
print(nrow(nhanes_data[svy_subpop_htn == 1]))
[1] 56017
2.1 Descriptive Graphs
2.1.1 Demographic Histograms (Blood Pressure vs Demographic Variables)
Systolic vs Age and Systolic vs Gender are very clear
%>% drop_na(bp_dia_mean) %>%
nhanes_data ggplot(aes(x=bp_dia_mean, color=demo_age_cat)) +
geom_histogram(fill="white", alpha=0.5, bins = 80) +
ggtitle("Blood Pressure (Diastolic) with Age")
%>% drop_na(bp_sys_mean) %>%
nhanes_data ggplot(aes(x=bp_sys_mean, color=demo_age_cat)) +
geom_histogram(fill="white", alpha=0.5, bins = 80) +
ggtitle("Blood Pressure (Systolic) with Age")
%>% drop_na(bp_sys_mean) %>%
nhanes_data ggplot(aes(x=bp_sys_mean, color=demo_race)) +
geom_histogram(fill="white", alpha=0.5, bins = 80) +
ggtitle("Blood Pressure (Systolic) with Race")
%>% drop_na(bp_sys_mean) %>%
nhanes_data ggplot(aes(x=bp_sys_mean, color=demo_gender)) +
geom_histogram(fill="white", alpha=0.5, bins = 80) +
ggtitle("Blood Pressure (Systolic) with Gender")
<- nhanes_data[nhanes_data$demo_gender == "Women", ]
nhanes_female %>% drop_na(demo_pregnant) %>%
nhanes_female ggplot(aes(x=bp_sys_mean, color=demo_pregnant)) +
geom_histogram(fill="white", alpha=0.5, bins = 80) +
ggtitle("Blood Pressure (Systolic) with Pregnancy (Men Excluded)")
Warning: Removed 1678 rows containing non-finite outside the scale range
(`stat_bin()`).
%>% drop_na(demo_pregnant) %>%
nhanes_female ggplot(aes(x=bp_dia_mean, color=demo_pregnant)) +
geom_histogram(fill="white", alpha=0.5, bins = 80) +
ggtitle("Blood Pressure (Diastolic) with Pregnancy (Men Excluded)")
Warning: Removed 1809 rows containing non-finite outside the scale range
(`stat_bin()`).
2.1.2 Comorbidities Box Plots (Blood Pressure vs Covariate)
%>% drop_na(cc_smoke) %>%
nhanes_data ggplot(aes(x=cc_smoke, y=bp_sys_mean, fill=cc_smoke)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6) +
theme_ipsum() +
theme(
legend.position="none",
plot.title = element_text(size=11)
+
) ggtitle("Blood Pressure VS Smoking Status") +
xlab("")
Warning: Removed 3111 rows containing non-finite outside the scale range
(`stat_boxplot()`).
%>% drop_na(cc_bmi) %>%
nhanes_data ggplot(aes(x=cc_bmi, y=bp_sys_mean, fill=cc_bmi)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6) +
theme_ipsum() +
theme(
legend.position="none",
plot.title = element_text(size=11)
+
) ggtitle("Blood Pressure VS BMI Status") +
xlab("")
Warning: Removed 2867 rows containing non-finite outside the scale range
(`stat_boxplot()`).
%>% drop_na(cc_cvd_chd) %>%
nhanes_data ggplot(aes(x=cc_cvd_chd, y=bp_sys_mean, fill=cc_cvd_chd)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6) +
theme_ipsum() +
theme(
legend.position="none",
plot.title = element_text(size=11)
+
) ggtitle("Blood Pressure VS Chronic Heart Disease Status") +
xlab("")
Warning: Removed 3049 rows containing non-finite outside the scale range
(`stat_boxplot()`).
%>% drop_na(cc_ckd) %>%
nhanes_data ggplot(aes(x=cc_ckd, y=bp_sys_mean, fill=cc_ckd)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6) +
theme_ipsum() +
theme(
legend.position="none",
plot.title = element_text(size=11)
+
) ggtitle("Blood Pressure VS Chronic Kidney Disease Status") +
xlab("")
Warning: Removed 3265 rows containing non-finite outside the scale range
(`stat_boxplot()`).
%>% drop_na(cc_diabetes) %>%
nhanes_data ggplot(aes(x=cc_diabetes, y=bp_sys_mean, fill=cc_diabetes)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6) +
theme_ipsum() +
theme(
legend.position="none",
plot.title = element_text(size=11)
+
) ggtitle("Blood Pressure VS Diabetes") +
xlab("")
Warning: Removed 3265 rows containing non-finite outside the scale range
(`stat_boxplot()`).
2.1.3 Comorbidities Histograms (Blood Pressure vs Covariate)
%>% drop_na(cc_smoke) %>%
nhanes_data ggplot(aes(x=bp_sys_mean, color=cc_smoke)) +
geom_histogram(fill="white", alpha=0.5, bins= 80) +
scale_fill_viridis(discrete = TRUE, alpha=0.6) +
ggtitle("Blood Pressure with Smoking Histogram")
Warning: Removed 3111 rows containing non-finite outside the scale range
(`stat_bin()`).
%>% drop_na(cc_bmi) %>%
nhanes_data ggplot(aes(x=bp_sys_mean, color=cc_bmi)) +
geom_histogram(fill="white", alpha=0.5, bins = 80) +
ggtitle("Blood Pressure with BMI Histogram")
Warning: Removed 2867 rows containing non-finite outside the scale range
(`stat_bin()`).
%>% drop_na(cc_cvd_stroke) %>%
nhanes_data ggplot(aes(x=bp_sys_mean, color=cc_cvd_stroke)) +
geom_histogram(fill="white", alpha=0.5, bins = 80) +
ggtitle("Blood Pressure with Stroke Histogram")
Warning: Removed 3050 rows containing non-finite outside the scale range
(`stat_bin()`).
%>% drop_na(cc_diabetes) %>%
nhanes_data ggplot(aes(x=bp_sys_mean, color=cc_diabetes)) +
geom_histogram(fill="white", alpha=0.5, bins = 80) +
ggtitle("Blood Pressure with Diabetes Histogram")
Warning: Removed 3265 rows containing non-finite outside the scale range
(`stat_bin()`).
2.1.4 Chi Squared Tests (Testing difference between covariates and hypertension status)
I realize this isn’t very useful (of course the result will be significant) but I left them in anyways
#Body Mass Index
<- nhanes_data %>% select(htn_accaha, cc_bmi) %>% drop_na(cc_bmi)
nhanes_bmi_df <- table(nhanes_bmi_df$htn_accaha, nhanes_bmi_df$cc_bmi)
nhanes_bmi_df chisq.test(nhanes_bmi_df)
Pearson's Chi-squared test
data: nhanes_bmi_df
X-squared = 2576, df = 3, p-value < 2.2e-16
#Smoking
<- nhanes_data %>% select(htn_accaha, cc_smoke) %>% drop_na(cc_smoke)
nhanes_smoke_df <- table(nhanes_smoke_df$htn_accaha, nhanes_smoke_df$cc_smoke)
nhanes_smoke_df chisq.test(nhanes_smoke_df)
Pearson's Chi-squared test
data: nhanes_smoke_df
X-squared = 991.83, df = 2, p-value < 2.2e-16
#Chronic Heart Disease
<- nhanes_data %>% select(htn_accaha, cc_cvd_chd) %>% drop_na(cc_cvd_chd)
nhanes_chd_df <- table(nhanes_chd_df$htn_accaha, nhanes_chd_df$cc_cvd_chd)
nhanes_chd_df chisq.test(nhanes_chd_df)
Pearson's Chi-squared test with Yates' continuity correction
data: nhanes_chd_df
X-squared = 1357.9, df = 1, p-value < 2.2e-16
#Chronic Kidney Disease
<- nhanes_data %>% select(htn_accaha, cc_ckd) %>% drop_na(cc_ckd)
nhanes_ckd_df <- table(nhanes_ckd_df$htn_accaha, nhanes_ckd_df$cc_ckd)
nhanes_ckd_df chisq.test(nhanes_ckd_df)
Pearson's Chi-squared test with Yates' continuity correction
data: nhanes_ckd_df
X-squared = 3702.1, df = 1, p-value < 2.2e-16
2.1.5 Two Sample T Tests
(Testing the Difference in blood pressure between comorbidities)
Should I extend to demographic data?
#Don't know if this is the idea but:
<- nhanes_data %>% select(htn_accaha, bp_dia_mean) %>% drop_na(bp_dia_mean)
nhanes_dia_df <- t.test(bp_dia_mean ~ htn_accaha, data=nhanes_dia_df)
t_test t_test
Welch Two Sample t-test
data: bp_dia_mean by htn_accaha
t = -96.487, df = 45747, p-value < 2.2e-16
alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
95 percent confidence interval:
-9.443644 -9.067610
sample estimates:
mean in group No mean in group Yes
65.87311 75.12873
#Smoking T Test (Excludes Former):
<- nhanes_data[nhanes_data$cc_smoke != "Former", ] %>% select(cc_smoke, bp_sys_mean) %>% drop_na(bp_sys_mean) %>% drop_na(cc_smoke)
nhanes_sys_smoke <- t.test(bp_sys_mean ~ cc_smoke, data = nhanes_sys_smoke)
smoke_test smoke_test
Welch Two Sample t-test
data: bp_sys_mean by cc_smoke
t = 0.85479, df = 20379, p-value = 0.3927
alternative hypothesis: true difference in means between group Never and group Current is not equal to 0
95 percent confidence interval:
-0.2343734 0.5968840
sample estimates:
mean in group Never mean in group Current
123.6788 123.4975
#BMI T Test (Compares <25 and 35+):
<- nhanes_data[nhanes_data$cc_bmi %in% c("<25", "35+"), ] %>% select(cc_bmi, bp_sys_mean) %>% drop_na(bp_sys_mean) %>% drop_na(cc_bmi)
nhanes_sys_bmi <- t.test(bp_sys_mean ~ cc_bmi, data = nhanes_sys_bmi)
bmi_test bmi_test
Welch Two Sample t-test
data: bp_sys_mean by cc_bmi
t = -26.614, df = 18857, p-value < 2.2e-16
alternative hypothesis: true difference in means between group <25 and group 35+ is not equal to 0
95 percent confidence interval:
-6.963786 -6.008393
sample estimates:
mean in group <25 mean in group 35+
120.332 126.818
#Diabetes T Test:
<- nhanes_data %>% select(cc_diabetes, bp_sys_mean) %>% drop_na(bp_sys_mean) %>% drop_na(cc_diabetes)
nhanes_sys_diabetes <- t.test(bp_sys_mean ~ cc_diabetes, data = nhanes_sys_diabetes)
diabetes_test diabetes_test
Welch Two Sample t-test
data: bp_sys_mean by cc_diabetes
t = -40.314, df = 9221.3, p-value < 2.2e-16
alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
95 percent confidence interval:
-10.84351 -9.83791
sample estimates:
mean in group No mean in group Yes
122.7845 133.1252
2.2 Simple Diabetes Modeling
We are choosing Diabetes as the co-morbidity for modeling. We will try to predict the occurrence of diabetes as a function of blood pressure as well as other covariates.
The most basic model possible is a simple logistic regression model predicting diabetes status based on blood pressure:
<- glm(cc_diabetes ~ bp_sys_mean, data = nhanes_data, family = binomial)
simple_diab_model_sys
summary(simple_diab_model_sys)
Call:
glm(formula = cc_diabetes ~ bp_sys_mean, family = binomial, data = nhanes_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.9404827 0.0770324 -64.14 <2e-16 ***
bp_sys_mean 0.0238141 0.0005811 40.98 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 43614 on 56533 degrees of freedom
Residual deviance: 41987 on 56532 degrees of freedom
(3265 observations deleted due to missingness)
AIC: 41991
Number of Fisher Scoring iterations: 4
::kable(tidy(simple_diab_model_sys, exponentiate = TRUE, conf.int = TRUE)) knitr
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 0.0071511 | 0.0770324 | -64.13512 | 0 | 0.0061477 | 0.0083151 |
bp_sys_mean | 1.0240999 | 0.0005811 | 40.98151 | 0 | 1.0229344 | 1.0252673 |
<- glm(cc_diabetes ~ bp_dia_mean, data = nhanes_data, family = binomial)
simple_diab_model_dia
summary(simple_diab_model_dia)
Call:
glm(formula = cc_diabetes ~ bp_dia_mean, family = binomial, data = nhanes_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.652673 0.073728 -22.416 < 2e-16 ***
bp_dia_mean -0.003651 0.001039 -3.514 0.000441 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 43309 on 56285 degrees of freedom
Residual deviance: 43297 on 56284 degrees of freedom
(3513 observations deleted due to missingness)
AIC: 43301
Number of Fisher Scoring iterations: 4
::kable(tidy(simple_diab_model_dia, exponentiate = TRUE, conf.int = TRUE)) knitr
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 0.1915372 | 0.0737285 | -22.415661 | 0.0000000 | 0.1657332 | 0.2212722 |
bp_dia_mean | 0.9963553 | 0.0010390 | -3.514294 | 0.0004409 | 0.9943280 | 0.9983860 |
2.2.0.1 Model Explanation:
For the logistic regression using systolic blood pressure, we can see that the odds ratio beta parameter is 1.024. This implies that a higher systolic blood pressure corresponds to increased chances of observing diabetes.
A more exact interpretation would be that: for one additional unit increase in systolic blood pressure, the estimated risk of having diabetes increases by 2.4%.
This value may not seem extremely high, yet it still has a p value evaluated to be 0 by R. We can also see that the 95% confidence interval of this parameter ranges between 1.0229 and 1.0253, meaning that the model predicts that the true relationship falls within this interval with a 95% certainty, since the true value of the parameter is most likely within this interval.
The opposite is true with diastolic blood pressure: the beta parameter is 0.996, implying that higher diastolic blood pressure corresponds to a lower chance of observing diabetes. For an additional unit increase in diastolic blood pressure, the estimated risk of having diabetes decreases by 0.04%.
Note: systolic blood pressure is a better predictor of cardiovascular disease, so I chose to consider systolic rather than diastolic in future models.
2.3 Demographic Model
Next, we build a model including the demographic variables as well. This will hopefully provide a more accurate model, since it will have access to the further information in order to make predictions.
<- glm(cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender,data = nhanes_data, family = binomial)
demo_diab_model_sys
summary (demo_diab_model_sys)
Call:
glm(formula = cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race +
demo_gender, family = binomial, data = nhanes_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.6927561 0.0885174 -64.312 < 2e-16 ***
bp_sys_mean 0.0037581 0.0006792 5.533 3.14e-08 ***
demo_age_years 0.0529545 0.0009161 57.806 < 2e-16 ***
demo_raceNon-Hispanic Black 0.9034242 0.0350353 25.786 < 2e-16 ***
demo_raceNon-Hispanic Asian 0.8093210 0.0606852 13.336 < 2e-16 ***
demo_raceHispanic 0.8592775 0.0343392 25.023 < 2e-16 ***
demo_raceOther 0.7478892 0.0718834 10.404 < 2e-16 ***
demo_genderWomen -0.1547613 0.0265581 -5.827 5.63e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 43614 on 56533 degrees of freedom
Residual deviance: 37780 on 56526 degrees of freedom
(3265 observations deleted due to missingness)
AIC: 37796
Number of Fisher Scoring iterations: 5
::kable(tidy(demo_diab_model_sys, exponentiate = TRUE, conf.int = TRUE)) knitr
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 0.0033703 | 0.0885174 | -64.312318 | 0 | 0.0028326 | 0.0040077 |
bp_sys_mean | 1.0037652 | 0.0006792 | 5.533450 | 0 | 1.0024283 | 1.0051008 |
demo_age_years | 1.0543817 | 0.0009161 | 57.805744 | 0 | 1.0524962 | 1.0562830 |
demo_raceNon-Hispanic Black | 2.4680396 | 0.0350353 | 25.786107 | 0 | 2.3042427 | 2.6434926 |
demo_raceNon-Hispanic Asian | 2.2463822 | 0.0606852 | 13.336382 | 0 | 1.9925692 | 2.5278493 |
demo_raceHispanic | 2.3614540 | 0.0343392 | 25.023206 | 0 | 2.2077734 | 2.5259187 |
demo_raceOther | 2.1125361 | 0.0718834 | 10.404206 | 0 | 1.8320999 | 2.4286788 |
demo_genderWomen | 0.8566196 | 0.0265581 | -5.827267 | 0 | 0.8131556 | 0.9023806 |
2.3.0.1 Model Explanation
In this model, we can still see a slight upwards trend with systolic blood pressure, although it is a bit weaker (1.024 > 1.003). With this parameter, the estimated risk of having diabetes increases only by 0.3%. Although the effect is smaller, we can see that the confidence interval is between 1.0024 and 1.0051, which does not include 1. So, although small, the positive relationship is most likely significant.
2.3.0.2 Demographics
Age:
The model also tells us that age is positively correlated with the observation of diabetes. With a log odds ratio of 1.054, it seems that every year increases the risk of diabetes occurrence by 5.4%.
Race:
The log odds ratio for categorical variables compares the other variables to the baseline “reference group”. Here, the “reference group” for the race variable is Non-Hispanic White. We can see that, in comparison to this group, all other races have significantly higher risk of diabetes: for example, the Hispanic population’s risk of diabetes occurrence is 2.36 times higher than the Non-Hispanic White population.
Gender:
With this variable, the reference group is men. We can see that women have a lower odds of having diabetes compared to men. Specifically, the risk of diabetes in women is about 85.6% of in men.
2.4 Full Model
Lastly, we want to create a model including confounding variables. To achieve this, we must first determine which other variables are confounding, meaning they have significant associations with diabetes.
We can achieve this by using chi squared test, which compares whether or not the observed ratios of diabetes and confounding variables matches the ratios expected by random chance. If the chi squared test determines significance, then diabetes and the other variables commonly occur together and therefore are confounding.
2.4.0.1 Perform Chi Squared Tests:
#Testing for BMI
<- nhanes_data %>% select(cc_diabetes, cc_bmi) %>% drop_na(cc_bmi) %>% drop_na(cc_diabetes)
nhanes_bmi_diab <- table(nhanes_bmi_diab)
nhanes_bmi_diab nhanes_bmi_diab
cc_bmi
cc_diabetes <25 25 to <30 30 to <35 35+
No 17390 16913 9476 6965
Yes 974 2173 2056 2274
chisq.test(nhanes_bmi_df)
Pearson's Chi-squared test
data: nhanes_bmi_df
X-squared = 2576, df = 3, p-value < 2.2e-16
#Testing for Smoking
<- nhanes_data %>% select(cc_diabetes, cc_smoke) %>% drop_na(cc_smoke) %>% drop_na(cc_diabetes)
nhanes_smoke_diab <- table(nhanes_smoke_diab)
nhanes_smoke_diab nhanes_smoke_diab
cc_smoke
cc_diabetes Never Former Current
No 27414 11107 10366
Yes 3917 2587 1249
chisq.test(nhanes_smoke_diab)
Pearson's Chi-squared test
data: nhanes_smoke_diab
X-squared = 435.81, df = 2, p-value < 2.2e-16
#Testing for Hypertensive Medication use
<- nhanes_data %>% select(cc_diabetes, bp_med_use) %>% drop_na(bp_med_use) %>% drop_na(cc_diabetes)
nhanes_med_diab <- table(nhanes_med_diab)
nhanes_med_diab nhanes_med_diab
bp_med_use
cc_diabetes No Yes
No 41679 10079
Yes 3138 4611
chisq.test(nhanes_med_diab)
Pearson's Chi-squared test with Yates' continuity correction
data: nhanes_med_diab
X-squared = 5807.1, df = 1, p-value < 2.2e-16
After this analysis, we can see that all three variables, BMI, Smoking, and Medication Use, are all confounding. We can include all of them in the next model, in addition to systolic blood pressure and the demographic variables:
#to prevent errors down the line, exclude the rows with na:
= c('cc_diabetes', 'bp_sys_mean', 'demo_age_years', 'demo_race', 'demo_gender', 'cc_bmi', 'cc_smoke', 'bp_med_use')
used_vars <- nhanes_data[complete.cases(nhanes_data[,..used_vars]), ]
clean_nhanes
<- glm(cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender+ cc_bmi + cc_smoke + bp_med_use, data = clean_nhanes, family = binomial)
full_diab_model_sys
summary(full_diab_model_sys)
Call:
glm(formula = cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race +
demo_gender + cc_bmi + cc_smoke + bp_med_use, family = binomial,
data = clean_nhanes)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.3003463 0.1112295 -56.643 < 2e-16 ***
bp_sys_mean 0.0017066 0.0007373 2.315 0.0206 *
demo_age_years 0.0470354 0.0011312 41.579 < 2e-16 ***
demo_raceNon-Hispanic Black 0.6987360 0.0374745 18.646 < 2e-16 ***
demo_raceNon-Hispanic Asian 1.3022903 0.0660355 19.721 < 2e-16 ***
demo_raceHispanic 0.8889260 0.0366792 24.235 < 2e-16 ***
demo_raceOther 0.8913458 0.0763522 11.674 < 2e-16 ***
demo_genderWomen -0.3082492 0.0295124 -10.445 < 2e-16 ***
cc_bmi25 to <30 0.5927344 0.0444524 13.334 < 2e-16 ***
cc_bmi30 to <35 1.1736075 0.0463565 25.317 < 2e-16 ***
cc_bmi35+ 1.7702868 0.0480051 36.877 < 2e-16 ***
cc_smokeFormer 0.0614704 0.0335729 1.831 0.0671 .
cc_smokeCurrent 0.1853436 0.0403687 4.591 4.41e-06 ***
bp_med_useYes 0.9121351 0.0314064 29.043 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 41278 on 52196 degrees of freedom
Residual deviance: 33055 on 52183 degrees of freedom
AIC: 33083
Number of Fisher Scoring iterations: 6
::kable(tidy(full_diab_model_sys, exponentiate = TRUE, conf.int = TRUE)) knitr
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 0.0018357 | 0.1112295 | -56.642782 | 0.0000000 | 0.0014752 | 0.0022815 |
bp_sys_mean | 1.0017080 | 0.0007373 | 2.314675 | 0.0206307 | 1.0002594 | 1.0031545 |
demo_age_years | 1.0481591 | 0.0011312 | 41.579121 | 0.0000000 | 1.0458438 | 1.0504919 |
demo_raceNon-Hispanic Black | 2.0112090 | 0.0374745 | 18.645645 | 0.0000000 | 1.8687584 | 2.1644762 |
demo_raceNon-Hispanic Asian | 3.6777100 | 0.0660355 | 19.721057 | 0.0000000 | 3.2286139 | 4.1827136 |
demo_raceHispanic | 2.4325157 | 0.0366792 | 24.235131 | 0.0000000 | 2.2638469 | 2.6139232 |
demo_raceOther | 2.4384090 | 0.0763522 | 11.674130 | 0.0000000 | 2.0964731 | 2.8282006 |
demo_genderWomen | 0.7347322 | 0.0295124 | -10.444726 | 0.0000000 | 0.6934113 | 0.7784570 |
cc_bmi25 to <30 | 1.8089279 | 0.0444524 | 13.334145 | 0.0000000 | 1.6585185 | 1.9742572 |
cc_bmi30 to <35 | 3.2336368 | 0.0463565 | 25.316997 | 0.0000000 | 2.9536973 | 3.5423428 |
cc_bmi35+ | 5.8725376 | 0.0480051 | 36.877031 | 0.0000000 | 5.3470462 | 6.4542286 |
cc_smokeFormer | 1.0633990 | 0.0335729 | 1.830951 | 0.0671079 | 0.9956021 | 1.1356446 |
cc_smokeCurrent | 1.2036319 | 0.0403687 | 4.591266 | 0.0000044 | 1.1117990 | 1.3024331 |
bp_med_useYes | 2.4896325 | 0.0314064 | 29.042966 | 0.0000000 | 2.3411270 | 2.6478476 |
2.4.0.2 Model Explanation
The previous variables in this model continue to show the same patterns, although again, the effect of systolic blood pressure is again reduced due to a lower log odds ratio. Now, the p value seems to be much higher at 0.02 (which is still significant, but much less so). The demographic variables also have much the same pattern, although Non-Hispanic Asians seem to have a much higher odds ratio than in the previous model.
2.4.0.3 Covariates
BMI:
In this variable, the baseline reference level is bmi <25. In comparison, those with higher bmi have increased risk of having diabetes, with the risk increasing the higher the bmi becomes. For example, those with bmi from 25-30 have a 1.8 times higher risk, while those with a bmi of 35+ have a 5.8 times higher risk.
Smoking:
With a baseline level of nonsmoking, former smokers may have about a 6% higher risk of diabetes. This may not be significant, however, given that the p value is only 0.067 and the odds ratio of 1 falls within the confidence interval. Current smokers, on the other hand, have about a 20% higher risk, this time with a much lower p value of 4*10^-6
Blood Pressure Medication Use:
Compared to those who do not use blood pressure medication, individuals who do have about a 2.5 times higher risk of experiencing diabetes.
2.5 Model Selection
It is possible to find an even better model by modifying which variables we include in our logistic regression model. We can achieve this using model selection, which will selectively add or subtract our explanatory variables from the model and evaluate its performance to choose the final best set of variables.
Explanation of model selection and or AIC?
2.5.0.1 Backwards Selection:
= step(full_diab_model_sys) backwards
Start: AIC=33082.55
cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender +
cc_bmi + cc_smoke + bp_med_use
Df Deviance AIC
<none> 33055 33083
- bp_sys_mean 1 33060 33086
- cc_smoke 2 33076 33100
- demo_gender 1 33164 33190
- bp_med_use 1 33909 33935
- demo_race 4 33930 33950
- cc_bmi 3 34756 34778
- demo_age_years 1 34931 34957
formula(backwards)
cc_diabetes ~ bp_sys_mean + demo_age_years + demo_race + demo_gender +
cc_bmi + cc_smoke + bp_med_use
It looks like the backwards selection model returned the same full model. In other words, removing variables does not improve its AIC, and therefore does not improve its performance.
2.5.0.2 Forward Selection
#start from a model without any predictors
<- glm(cc_diabetes ~ 1, data = clean_nhanes, family=binomial)
nothing summary(nothing)
Call:
glm(formula = cc_diabetes ~ 1, family = binomial, data = clean_nhanes)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.85919 0.01282 -145.1 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 41278 on 52196 degrees of freedom
Residual deviance: 41278 on 52196 degrees of freedom
AIC: 41280
Number of Fisher Scoring iterations: 4
= step(nothing,
forwards scope=list(lower=formula(nothing),upper=formula(full_diab_model_sys)), direction="forward")
Start: AIC=41279.9
cc_diabetes ~ 1
Df Deviance AIC
+ bp_med_use 1 37076 37080
+ demo_age_years 1 37252 37256
+ cc_bmi 3 39218 39226
+ bp_sys_mean 1 39888 39892
+ demo_race 4 40891 40901
+ cc_smoke 2 40911 40917
+ demo_gender 1 41242 41246
<none> 41278 41280
Step: AIC=37079.85
cc_diabetes ~ bp_med_use
Df Deviance AIC
+ demo_age_years 1 35675 35681
+ cc_bmi 3 35907 35917
+ demo_race 4 36597 36609
+ bp_sys_mean 1 36788 36794
+ cc_smoke 2 36968 36976
+ demo_gender 1 37009 37015
<none> 37076 37080
Step: AIC=35681.48
cc_diabetes ~ bp_med_use + demo_age_years
Df Deviance AIC
+ cc_bmi 3 34070 34082
+ demo_race 4 34834 34848
+ demo_gender 1 35624 35632
+ bp_sys_mean 1 35654 35662
+ cc_smoke 2 35670 35680
<none> 35675 35681
Step: AIC=34082.43
cc_diabetes ~ bp_med_use + demo_age_years + cc_bmi
Df Deviance AIC
+ demo_race 4 33213 33233
+ demo_gender 1 33963 33977
+ bp_sys_mean 1 34050 34064
+ cc_smoke 2 34054 34070
<none> 34070 34082
Step: AIC=33232.52
cc_diabetes ~ bp_med_use + demo_age_years + cc_bmi + demo_race
Df Deviance AIC
+ demo_gender 1 33081 33103
+ cc_smoke 2 33170 33194
+ bp_sys_mean 1 33208 33230
<none> 33213 33233
Step: AIC=33102.89
cc_diabetes ~ bp_med_use + demo_age_years + cc_bmi + demo_race +
demo_gender
Df Deviance AIC
+ cc_smoke 2 33060 33086
+ bp_sys_mean 1 33076 33100
<none> 33081 33103
Step: AIC=33085.88
cc_diabetes ~ bp_med_use + demo_age_years + cc_bmi + demo_race +
demo_gender + cc_smoke
Df Deviance AIC
+ bp_sys_mean 1 33055 33083
<none> 33060 33086
Step: AIC=33082.55
cc_diabetes ~ bp_med_use + demo_age_years + cc_bmi + demo_race +
demo_gender + cc_smoke + bp_sys_mean
The Forward Selection process again selects the full model.
= step(nothing, list(lower=formula(nothing),upper=formula(full_diab_model_sys)),
bothways direction="both",trace=0)
formula(bothways)
cc_diabetes ~ bp_med_use + demo_age_years + cc_bmi + demo_race +
demo_gender + cc_smoke + bp_sys_mean
The Forward-Backward algorithm also selects all of the variables
2.6 Model Diagnostics
I followed a guide online but I don’t really know what this is doing:
%>%
clean_nhanes mutate(comp_res = coef(full_diab_model_sys)["demo_age_years"]*demo_age_years + residuals(full_diab_model_sys, type = "working")) %>%
ggplot(aes(x = demo_age_years, y = comp_res)) +
geom_point() +
geom_smooth(color = "red", method = "lm", linetype = 2, se = F) +
geom_smooth(se = F)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
%>%
clean_nhanes mutate(comp_res = coef(full_diab_model_sys)["bp_sys_mean"]*bp_sys_mean + residuals(full_diab_model_sys, type = "working")) %>%
ggplot(aes(x = bp_sys_mean, y = comp_res)) +
geom_point() +
geom_smooth(color = "red", method = "lm", linetype = 2, se = F) +
geom_smooth(se = F)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Test for Multicollinearity
vif(full_diab_model_sys)
GVIF Df GVIF^(1/(2*Df))
bp_sys_mean 1.169212 1 1.081301
demo_age_years 1.528167 1 1.236191
demo_race 1.207663 4 1.023866
demo_gender 1.091056 1 1.044536
cc_bmi 1.182024 3 1.028263
cc_smoke 1.173951 2 1.040908
bp_med_use 1.235766 1 1.111650
Outliers
<-
outlier_nhanes %>%
clean_nhanes mutate(dffits = dffits(full_diab_model_sys))
%>%
outlier_nhanes mutate(obs_number = row_number(),
large = ifelse(abs(dffits) > 2*sqrt(length(coef(full_diab_model_sys))/nobs(full_diab_model_sys)),
"red", "black")) %>%
ggplot(aes(obs_number, dffits, color = large)) +
geom_point() +
geom_hline(yintercept = c(-1,1) * 2*sqrt(length(coef(full_diab_model_sys))/nobs(full_diab_model_sys)), color = "red") +
scale_color_identity()