knitr::opts_chunk$set(comment=NA)
library(equatiomatic)
library(nhanesA)
library(janitor)
library(magrittr)
library(naniar)
library(car)
library(GGally)
library(ggrepel)
library(janitor)
library(magrittr)
library(patchwork)
library(broom)
library(mosaic)
library(tidyverse)
options(max.print="75")
theme_set(theme_bw())
options(dplyr.summarise.inform = FALSE)
Three NHANES databases
demo_raw <- nhanes('DEMO_J') %>% tibble()
Processing SAS dataset DEMO_J ..
hscrp_raw <- nhanes('HSCRP_J') %>% tibble()
Processing SAS dataset HSCRP_J ..
bmx_raw <- nhanes('BMX_J') %>% tibble()
Processing SAS dataset BMX_J ..
glu_raw <- nhanes('GLU_J') %>% tibble()
Processing SAS dataset GLU_J ..
bio_raw <- nhanes('BIOPRO_J') %>% tibble()
Processing SAS dataset BIOPRO_J ..
bpx_raw <- nhanes('BPX_J') %>% tibble()
Processing SAS dataset BPX_J ..
demo1 <- demo_raw %>%
select(SEQN, RIDSTATR, RIAGENDR, RIDRETH3, RIDAGEYR) %>%
filter(complete.cases(.))
dim(demo1)
[1] 9254 5
crp1 <- hscrp_raw %>%
select(SEQN, LBXHSCRP) %>%
filter(complete.cases(.))
dim(crp1)
[1] 7250 2
bmx1 <- bmx_raw %>%
select(SEQN, BMXWAIST) %>%
filter(complete.cases(.))
dim(bmx1)
[1] 7601 2
glu1 <- glu_raw %>%
select(SEQN, LBXGLU) %>%
filter(complete.cases(.))
dim(glu1)
[1] 2891 2
bio1 <- bio_raw %>%
select(SEQN, LBXSTR) %>%
filter(complete.cases(.))
dim(bio1)
[1] 5901 2
bpx1 <- bpx_raw %>%
select(SEQN, BPXSY1,BPXDI1) %>%
filter(complete.cases(.))
dim(bpx1)
[1] 6302 3
I have four tibbles now, which (once we have merged them together) will contain data on 2414 subjects for 11
Hmisc::label(crp1$SEQN) <- "Respondent sequence number"
Hmisc::label(bio1$SEQN) <- "Respondent sequence number"
temp1 <- left_join(demo1, crp1, by = "SEQN")
dim(temp1)
[1] 9254 6
temp2 <- left_join(temp1, bmx1, by = "SEQN")
dim(temp2)
[1] 9254 7
temp3 <- left_join(temp2, glu1, by = "SEQN")
dim(temp3)
[1] 9254 8
temp4 <- left_join(temp3, bio1, by = "SEQN")
dim(temp4)
[1] 9254 9
merged_data <- left_join(temp4, bpx1, by = "SEQN") %>%
filter(complete.cases(.))
dim(merged_data)
[1] 2414 11
In this first step of the cleaning process I have:
only included adults (ages 20-79)
renamed the raw variable names
converted sex
and ethnicity
to factors as well as provide their values with meaningful names
releveled ethnicity
so that “White” is the baseline category
converted SEQN
to a character variable
nh_clean <- merged_data %>%
filter(RIDAGEYR > 19 & RIDAGEYR < 80) %>% ## ages 20-79 only
rename(sex = RIAGENDR,
ethnicity = RIDRETH3,
crp = LBXHSCRP,
waist = BMXWAIST,
fast_glu=LBXGLU,
trg= LBXSTR,
sbp = BPXSY1,
dbp = BPXDI1) %>%
mutate(sex=as.factor(sex)) %>%
mutate(sex = fct_recode(sex,
"male" ="1",
"female"="2")) %>%
mutate(ethnicity=as.factor(ethnicity)) %>%
mutate(ethnicity = fct_recode(ethnicity,
"MA" ="1",
"Latino"="2",
"White" ="3",
"Black"= "4",
"Asian"= "6",
"Other"= "7" )) %>%
mutate(ethnicity=fct_relevel(ethnicity,"White", "Black", "Asian", "MA", "Latino", "Other" )) %>%
mutate(SEQN = str_pad(SEQN, 5, pad = "0"))
I ran haven::zap_label() on my tibble as part of my cleaning process. This changes everything from “labeled” class to a more useful class
nh_clean <- haven::zap_label(nh_clean)
I will exclude implausible blood pressure of laboratory values:
SBP: <60 or >250 mm Hg
DBP: <40 or >140 mm Hg
Triglycerides <10 or >2,000 mg/dL
Fasting plasma glucose: <30 or >600 mg/dL
Waist circumference: ≤51 cm or ≥190 cm
CRP: I could not find any studies that excluded certain implausible values. Therefore, I only excluded values that are below the limit of detection (0.15 mg/L)
Hmisc::describe(nh_clean %>% select(sbp, dbp, trg, fast_glu, waist, crp, RIDAGEYR))
nh_clean %>% select(sbp, dbp, trg, fast_glu, waist, crp, RIDAGEYR)
7 Variables 1898 Observations
--------------------------------------------------------------------------------
sbp
n missing distinct Info Mean Gmd .05 .10
1898 0 63 0.999 124.8 20.65 100 104
.25 .50 .75 .90 .95
112 122 134 150 160
lowest : 78 86 88 90 92, highest: 204 206 210 216 224
--------------------------------------------------------------------------------
dbp
n missing distinct Info Mean Gmd .05 .10
1898 0 42 0.997 72.67 13.17 54 58
.25 .50 .75 .90 .95
66 72 80 88 92
lowest : 0 38 40 42 44, highest: 110 112 114 120 124
--------------------------------------------------------------------------------
trg
n missing distinct Info Mean Gmd .05 .10
1898 0 305 1 130.4 81.44 51.0 59.0
.25 .50 .75 .90 .95
76.0 110.0 154.0 218.0 272.3
lowest : 25 27 31 33 34, highest: 846 1058 1270 1531 2923
Value 0 50 100 150 200 250 300 350 400 450 500
Frequency 1 443 734 382 162 84 38 17 17 4 5
Proportion 0.001 0.233 0.387 0.201 0.085 0.044 0.020 0.009 0.009 0.002 0.003
Value 550 750 800 850 1050 1250 1550 2900
Frequency 4 1 1 1 1 1 1 1
Proportion 0.002 0.001 0.001 0.001 0.001 0.001 0.001 0.001
For the frequency table, variable is rounded to the nearest 50
--------------------------------------------------------------------------------
fast_glu
n missing distinct Info Mean Gmd .05 .10
1898 0 177 0.999 113.6 29.37 87 90
.25 .50 .75 .90 .95
96 104 115 143 173
lowest : 47 53 62 67 69, highest: 366 380 389 421 451
--------------------------------------------------------------------------------
waist
n missing distinct Info Mean Gmd .05 .10
1898 0 639 1 100.4 19.21 74.6 78.6
.25 .50 .75 .90 .95
88.0 98.8 111.1 122.7 130.8
lowest : 63.2 64.0 64.5 64.6 65.9, highest: 161.6 162.3 163.6 164.1 169.5
--------------------------------------------------------------------------------
crp
n missing distinct Info Mean Gmd .05 .10
1898 0 762 1 4.112 5.013 0.3700 0.4800
.25 .50 .75 .90 .95
0.8425 1.9700 4.3675 8.6100 13.5990
lowest : 0.11 0.16 0.17 0.18 0.19, highest: 77.79 87.45 90.51 91.36 109.81
--------------------------------------------------------------------------------
RIDAGEYR
n missing distinct Info Mean Gmd .05 .10
1898 0 60 1 48.95 18.95 23 26
.25 .50 .75 .90 .95
34 50 62 70 74
lowest : 20 21 22 23 24, highest: 75 76 77 78 79
--------------------------------------------------------------------------------
This is what I see:
No missing values for any of these quantitative variables
There were people with implausible dbp
values (<40 mm/Hg ) (n=11)
crp
values below the lower limit of detection (0.15 mg/L )
a triglyceride value > 2000 mg/ dL
SBP, fasting plasma glucose, and waist circumference all seemed like plausible values
It’s worth it to check to see that everyone has a sbp meaningfully larger than their dbp.
nh_clean %>% mutate(bp_diff = sbp - dbp) %>% select(SEQN, sbp, dbp, bp_diff) %>% slice_min(bp_diff, n = 3)
# A tibble: 5 x 4
SEQN sbp dbp bp_diff
<chr> <int> <int> <int>
1 95353 100 82 18
2 97533 104 84 20
3 98463 112 92 20
4 101626 104 84 20
5 102775 90 70 20
This looks fine.
Below I have excluded those with implausible dbp
, undetectable crp
, and implausible trg
nh_clean <- nh_clean %>%
filter(dbp>39) %>%
filter(crp >0.14) %>%
filter (trg <2000)
Background The National Health and Nutrition Examination Survey (NHANES) is a population-based, health survey of noninstitutionalized United States residents conducted by the National Center for Health Statistics of the CDC. NHANES uses a probability sampling method and oversamples for certain subgroups. Participants completed household surveys, which included demographic questions and medical history. Laboratory samples were collected and physical examinations were conducted.
I used data on adults age 20-79 from years 2017-2018. Further eligibility consisted of complete data on crp levels, waist circumference, ethnicity, sex, fasting plasma glucose, triglycerides, systolic blood pressure, and diastolic blood pressure.
A growing body of evidence suggests that metabolic syndrome is highly associated with inflammation, which is an independent risk factor for an acute coronary event. Studies suggest that as the criteria of metabolic syndrome increase, so does crp, which is a marker of inflammation. Waist circumference has continually shown to influence crp, and if decreased, then the crp will decrease as well. For men and women the criteria for metabolic syndrome include a waist circumference >40 cm and >35 cm, respectively.
QUESTION:
Using the 2017-2018 NHANES data, how well can I predict someone’s crp level from their waist circumference after adjusting for sex?
Does the prediction meaningfully improve after adjusting for metabolic syndrome criteria and ethnicity?
mosaic::favstats( ~RIDAGEYR, data = nh_clean) %>% select(n, min, max)
n min max
1880 20 79
1880 adults ages 20-79 participating in NHANES 2017-18 with complete data on the variables listed in the table below
Variable | Type | Description / Levels |
---|---|---|
SEQN |
ID | SEQN code (93711-102956) |
RIDSTATR |
Quant | Code 2 = Both interviewed and examined |
crp |
Quant | outcome (LBXHSCRP ) high sensitivity c reactive protein |
waist |
Quant | key predictor (BMXWAIST ) Waist Circumference, in centimeters |
fast_glu |
Quant | (LBXGLU ) Fasting Plasma Glucose, in mg/L |
trg |
Quant | (LBXSTR ) Triglycerides, in mg/dL |
sbp |
Quant | (BPXSY1 ) Systolic Blood Pressure, in mm/Hg |
dbp |
Quant | (BPXDI1 ) Diastolic Blood pressure, in mm/Hg |
RIDAGEYR |
Quant | Age |
sex |
Binary | (RIAGENDR ) male, female |
ethnicity |
6-Cat | (RIDRETH3 ) White, Black, Asian, Mexican American (MA), Latino, Other |
Below we can see that:
My data frame is a tibble
It has 1880 rows and 11 columns
1 character variables: SEQN
,
2 factor variables: sex
(2 levels) and ethnicity
(6 levels)
9 numeric variables: age
, crp
, waist
, fast_glu
, trg
, sbp
, dbp
, RIDSTAR
str(nh_clean)
tibble [1,880 × 11] (S3: tbl_df/tbl/data.frame)
$ SEQN : chr [1:1880] "93711" "93717" "93718" "93721" ...
$ RIDSTATR : int [1:1880] 2 2 2 2 2 2 2 2 2 2 ...
$ sex : Factor w/ 2 levels "male","female": 1 1 1 2 2 1 1 1 1 2 ...
$ ethnicity: Factor w/ 6 levels "White","Black",..: 3 1 2 4 1 4 5 5 2 1 ...
$ RIDAGEYR : int [1:1880] 56 22 45 60 60 20 52 26 72 73 ...
$ crp : num [1:1880] 0.82 0.8 0.69 3.04 0.41 0.75 1.59 5.26 2.03 0.64 ...
$ waist : num [1:1880] 86.6 86.2 77.5 113.2 89.7 ...
$ fast_glu : int [1:1880] 107 91 89 104 101 98 83 102 106 119 ...
$ trg : int [1:1880] 59 122 57 92 67 95 133 94 121 208 ...
$ sbp : int [1:1880] 108 116 128 132 116 114 154 120 166 126 ...
$ dbp : int [1:1880] 68 62 88 68 68 54 74 82 90 60 ...
Below I ran describe
from the Hmisc
package on my data.
Hmisc::describe(nh_clean)
nh_clean
11 Variables 1880 Observations
--------------------------------------------------------------------------------
SEQN
n missing distinct
1880 0 1880
lowest : 100001 100008 100015 100025 100026, highest: 99983 99984 99985 99989 99996
--------------------------------------------------------------------------------
RIDSTATR
n missing distinct Info Mean Gmd
1880 0 1 0 2 0
Value 2
Frequency 1880
Proportion 1
--------------------------------------------------------------------------------
sex
n missing distinct
1880 0 2
Value male female
Frequency 916 964
Proportion 0.487 0.513
--------------------------------------------------------------------------------
ethnicity
n missing distinct
1880 0 6
lowest : White Black Asian MA Latino, highest: Black Asian MA Latino Other
Value White Black Asian MA Latino Other
Frequency 600 432 276 284 180 108
Proportion 0.319 0.230 0.147 0.151 0.096 0.057
--------------------------------------------------------------------------------
RIDAGEYR
n missing distinct Info Mean Gmd .05 .10
1880 0 60 1 48.88 18.9 23 26
.25 .50 .75 .90 .95
34 50 62 70 74
lowest : 20 21 22 23 24, highest: 75 76 77 78 79
--------------------------------------------------------------------------------
crp
n missing distinct Info Mean Gmd .05 .10
1880 0 757 1 4.117 5.014 0.3800 0.4900
.25 .50 .75 .90 .95
0.8575 1.9800 4.3525 8.5560 13.5930
lowest : 0.16 0.17 0.18 0.19 0.20, highest: 77.79 87.45 90.51 91.36 109.81
--------------------------------------------------------------------------------
waist
n missing distinct Info Mean Gmd .05 .10
1880 0 637 1 100.4 19.2 74.89 78.69
.25 .50 .75 .90 .95
88.00 98.80 111.10 122.81 130.91
lowest : 63.2 64.0 64.5 64.6 65.9, highest: 161.6 162.3 163.6 164.1 169.5
--------------------------------------------------------------------------------
fast_glu
n missing distinct Info Mean Gmd .05 .10
1880 0 175 0.999 113.2 28.85 87.0 90.0
.25 .50 .75 .90 .95
96.0 104.0 115.0 141.1 171.0
lowest : 47 53 62 67 69, highest: 366 380 389 421 451
--------------------------------------------------------------------------------
trg
n missing distinct Info Mean Gmd .05 .10
1880 0 304 1 129.2 78.81 51.0 59.0
.25 .50 .75 .90 .95
76.0 110.0 154.0 218.0 272.1
lowest : 25 27 31 33 34, highest: 777 846 1058 1270 1531
--------------------------------------------------------------------------------
sbp
n missing distinct Info Mean Gmd .05 .10
1880 0 62 0.999 124.7 20.59 100 104
.25 .50 .75 .90 .95
112 122 134 150 160
lowest : 78 86 88 90 92, highest: 200 204 210 216 224
--------------------------------------------------------------------------------
dbp
n missing distinct Info Mean Gmd .05 .10
1880 0 40 0.997 73.05 12.62 56 58
.25 .50 .75 .90 .95
66 72 80 88 92
lowest : 40 42 44 46 48, highest: 110 112 114 120 124
--------------------------------------------------------------------------------
No Missing data for any of the variables
Quantitative variables have a sensible minimum and maximum value
crp
: 0.16-109.81 mg/L (plausible = <500)waist
: 63.2 to 169.5 cm (plausible = 51-190)fast_glu
: 47 to 451 (plausible = 30-600 )trg
: 25 to 1531 (plausible = 10-2000)sbp
: 78 to 224 (plausible = 60-250)dbp
: 40 to 124 (plausible = 40-140)I have checked both of my categorical variables, ethnicity
and sex
, for problems, and all levels have at least 108 subjects
All subjects have RIDSTATR = 2, and thus were included in both the questionnaire and examination data.
The age range for the subjects is 20-79
I am partitioning the clean data into a training model sample (70% of the data) and a model validation sample (the remaining 30%).
set.seed(1)
nh_train <- nh_clean %>%
slice_sample(prop = 0.70, replace=FALSE)
nh_test <-
anti_join(nh_clean, nh_train, by = "SEQN")
Below we can see that each SEQN in nh_clean
wound up in either in the training or the test sample.
c(nrow(nh_train), nrow(nh_test), nrow(nh_clean))
[1] 1316 564 1880
Below I have plotted the outcome, crp
for the training sample (n=1316) to evaluate normality of this outcome.
res <- mosaic::favstats(~ crp, data = nh_train)
bin_w <- 3
p1 <- ggplot(nh_train, aes(x = crp)) + geom_histogram(binwidth = bin_w,
fill = "salmon",
col = "white") + theme_light() +
stat_function(
fun = function(x) dnorm(x, mean = res$mean,
sd = res$sd) *res$n * bin_w,
col = "darkred", size = 1) +
labs(title = "Histogram with Normal fit",
x = "CRP (mg/L)", y = "# of SEQNs")
p2 <- ggplot(nh_train, aes(sample = crp)) + geom_qq(col = "salmon") + geom_qq_line(col = "black") + theme_light() +
labs(title = "Normal Q-Q plot",
y = "CRP (mg/L)")
p3 <- ggplot(nh_train, aes(x = "", y = crp)) + geom_violin() +
geom_boxplot(width = 0.2, fill = "salmon", outlier.color = "red") +
theme_light() +
coord_flip() +
labs(title = "Boxplot with Violin",
x = "", y = "CRP (mg/L)")
p1 + p2 - p3 + plot_layout(ncol = 1, height = c(3, 1)) + plot_annotation(title = "Distribution of CRP is highly right skewed", subtitle= "Training sample (N=1316) from NHANES 2017-2018 data")
We can see that there is considerable right skew, which means that I will need to evaluate transformations that might make a Normal model more plausible. Because the crp
data are right skewed and all the values are strictly positive, I will try taking a step “down” the ladder from power 1 to lower powers (square root, log, and inverse).
As explained above, since the crp
data are right skewed and all the values are strictly positive, I will try taking a step “down” the ladder.
Below I have plotted the square root, log, and inverse of the outcome, crp
p2 <- ggplot(nh_train, aes(sample = crp)) + geom_qq() + geom_qq_line(col = "red") + labs(title = "untransformed crp")
p3 <- ggplot(nh_train, aes(sample = log(crp))) + geom_qq() + geom_qq_line(col = "red") + labs(title = "log(crp)")
p4 <- ggplot(nh_train, aes(sample = sqrt(crp))) + geom_qq() + geom_qq_line(col = "red") + labs(title = "square root of crp")
p5 <- ggplot(nh_train, aes(sample = 1/crp)) + geom_qq() + geom_qq_line(col = "red") + labs(title = "inverse of crp")
( p2 + p3) / (p4 + p5) +
plot_annotation(title = "Evaluating transformations that step down the ladder (crp is right skewed)", subtitle = "nh_train sample (n=1316): log transformation is the most normally distributed")
Based on the plots above, log(crp
) appears the most normally distributed.
Here is another set of plots that show the beautiful normality of the log of crp
.
res <- mosaic::favstats(~ log(crp), data = nh_train)
bin_w <- 1
p1 <- ggplot(nh_train, aes(x = log(crp))) + geom_histogram(binwidth = bin_w,
fill = "salmon",
col = "white") + theme_light() +
stat_function(
fun = function(x) dnorm(x, mean = res$mean,
sd = res$sd) *res$n * bin_w,
col = "darkred", size = 1) +
labs(title = "Histogram with Normal fit",
x = "log(crp) [log(mg/L)]", y = "# of SEQNs")
p2 <- ggplot(nh_train, aes(sample = log(crp))) + geom_qq(col = "salmon") + geom_qq_line(col = "black") + theme_light() +
labs(title = "Normal Q-Q plot",
y = "log(crp) [log(mg/L)]")
p3 <- ggplot(nh_train, aes(x = "", y = log(crp))) + geom_violin() +
geom_boxplot(width = 0.2, fill = "salmon", outlier.color = "red") +
theme_light() +
coord_flip() +
labs(title = "Boxplot with Violin",
x = "", y = "log(crp) [log(mg/L)]")
p1 + p2 - p3 + plot_layout(ncol = 1, height = c(3, 1)) + plot_annotation(title = "Distribution of log(crp) is fairly normally distributed", subtitle= "Training sample (N=1316) from NHANES 2017-2018 data")
Below is a set of numerical summaries of each predictor variable in the training data
nh_train %>% select(-SEQN, -RIDSTATR, -RIDAGEYR, -crp ) %>%
mosaic::inspect()
Warning: `data_frame()` is deprecated as of tibble 1.1.0.
Please use `tibble()` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
categorical variables:
name class levels n missing
1 sex factor 2 1316 0
2 ethnicity factor 6 1316 0
distribution
1 female (52.8%), male (47.2%)
2 White (30.3%), Black (23.2%) ...
quantitative variables:
name class min Q1 median Q3 max mean sd n
...1 waist numeric 63.2 87.6 98.65 110.7 169.5 100.26117 17.04897 1316
...2 fast_glu integer 62.0 97.0 104.00 115.0 421.0 113.73328 36.65189 1316
...3 trg integer 25.0 76.0 111.00 155.0 1531.0 130.12690 92.88164 1316
...4 sbp integer 86.0 112.0 122.00 134.0 224.0 125.20973 18.93841 1316
...5 dbp integer 40.0 66.0 74.00 80.0 124.0 73.35562 11.64277 1316
missing
...1 0
...2 0
...3 0
...4 0
...5 0
sex
: 52.8% female & 47.2% male
ethnicity
: the majority of the subjects are White (30.3%) and Black (23.2%)
waist
(waist circumference) range: 63.2-169.5 cm, mean=100.3 cm, median = 98.7 cm
fast_glu
(fasting blood glucose): range: 62.0-421.0 mg/dL, mean=113.7 mg/dL, median=104 mg/dL
trg
(triglyceride levels): range 25.0-1531.0 mg/dL, mean=130, median=111 mg/dL
sbp
: range 86.0 -224.0 mmHg, median =122 mm /Hg
dbp
: 40.0 -1234 mm Hg, median = 74 mm/Hg
I have created two sets of scatterplot matrices to evaluate the shapes of the quantitative predictors and the relationships of the predictors with the transformed outcome (log_crp
)
nh_train %>%
mutate(log_crp= log(crp)) %>%
select(waist, fast_glu, trg, log_crp) %>%
ggpairs(., title = "Scatterplot Matrix: 3/7 predictors & the transformed outcome (log crp) (N=1316)",
lower = list(combo = wrap("facethist", bins = 20)))
nh_train %>%
mutate(log_crp= log(crp)) %>%
select(sbp, dbp, ethnicity, sex, log_crp) %>%
ggpairs(., title = "Scatterplot Matrix: remaining 4 predictors & the transformed outcome (log crp)",
lower = list(combo = wrap("facethist", bins = 10)))
Relationships
moderate positive relationship between log(crp)
and waist
weak positive relationship of log(crp) with: fast_glu
, trg
, and sbp
essentially no relationship between log(crp)
and dbp
no clear distinction with ethnicity
, except Asian
appears to have a lower median log(crp)
males appear to have a lower median log(crp)
than females
Distribution
waist
: fairly normally distributed, but slight right skew
fast_glu
: not normally distributed, some extreme high outliers causing right skew
trg
: not normally distributed, lack of left tail behavior, extreme high outliers causing right skew
sbp
: more normally distributed than some other key predictors, right skew
dbp
: most normal distribution of all predictors.
The ethnicity
predictor had more than 4 levels, thus it was difficult to observe any sort of patterns across the races.
ggplot(nh_train, aes(x = ethnicity, y = log(crp))) +
geom_violin() +
geom_boxplot(aes(fill = ethnicity), width = 0.3, notch = TRUE) + guides(fill = FALSE) +
labs(title = "Distribution of log(crp) by race category in adults", subtitle ="Training sample (N=1316) from NHANES 2017-2018 data", x= "Ethnicity", y = "log(crp)")
Interpretation:
There is no overlap between the notches for each of the six categories, so we might reasonably conclude that the true median log of crp across the six categories are statistically significantly different.
A fairly normal distribution is observed among the White and Black ethnicity category
The Asian category looks fairly scrunched up
There are many outliers in the Mexican American group
The distribution in the Latino category almost looks bimodal
The Other category lacks a left tail
I want to see strong correlations between the outcome, crp
, and the predictors, waist
, fast_glu
, trg
, sbp
, dbp
, ethnicity
, and sex
. However, I hope to only see relatively modest correlations between the predictors.
There was a modest correlation between sbp and dbp seen in the scatterplot matrix (r=0.52), however I don’t think this is high enough to be a problem.
The boxCox function confirms my desire to transform the outcome using a log transformation.
mod_0 <- lm(crp ~ waist + fast_glu + trg + sbp + dbp + ethnicity + sex , data = nh_train)
boxCox(mod_0)
Suggesting a transformation: suggesting 0 (log)
summary(powerTransform(mod_0))
bcPower Transformation to Normality
Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
Y1 -0.1427 -0.14 -0.1829 -0.1025
Likelihood ratio test that transformation parameter is equal to 0
(log transformation)
LRT df pval
LR test, lambda = (0) 50.68336 1 1.0854e-12
Likelihood ratio test that no transformation is needed
LRT df pval
LR test, lambda = (1) 3756.611 1 < 2.22e-16
My kitchen sink model will contain all 7 predictors.
The “kitchen sink” model predicts the log of crp
using the predictors waist
, fast_glu
, trg
, sbp
, dbp
, ethnicity
, and sex
.
model_big <- lm(log(crp) ~ waist + fast_glu + trg + sbp + dbp + ethnicity + sex,
data = nh_train)
summary(model_big)
Call:
lm(formula = log(crp) ~ waist + fast_glu + trg + sbp + dbp +
ethnicity + sex, data = nh_train)
Residuals:
Min 1Q Median 3Q Max
-3.2845 -0.6801 -0.0425 0.5721 4.7478
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.0717464 0.2426607 -12.659 < 2e-16 ***
waist 0.0309267 0.0016935 18.262 < 2e-16 ***
fast_glu 0.0015977 0.0007729 2.067 0.03893 *
trg 0.0003370 0.0003093 1.090 0.27605
sbp -0.0003464 0.0016736 -0.207 0.83606
dbp 0.0035853 0.0026807 1.337 0.18131
ethnicityBlack 0.1642485 0.0741643 2.215 0.02696 *
ethnicityAsian -0.2340307 0.0860088 -2.721 0.00659 **
ethnicityMA 0.0468092 0.0830302 0.564 0.57301
ethnicityLatino 0.1692856 0.0942094 1.797 0.07258 .
ethnicityOther 0.1864934 0.1185346 1.573 0.11589
sexfemale 0.3838528 0.0535985 7.162 1.33e-12 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9508 on 1304 degrees of freedom
Multiple R-squared: 0.2924, Adjusted R-squared: 0.2864
F-statistic: 48.98 on 11 and 1304 DF, p-value: < 2.2e-16
tidy(model_big, conf.int = TRUE, conf.level = 0.90) %>%
select(term, estimate, std.error, conf.low, conf.high, p.value) %>%
knitr::kable(digits = c(0, 4, 4, 3, 3, 2))
term | estimate | std.error | conf.low | conf.high | p.value |
---|---|---|---|---|---|
(Intercept) | -3.0717 | 0.2427 | -3.471 | -2.672 | 0.00 |
waist | 0.0309 | 0.0017 | 0.028 | 0.034 | 0.00 |
fast_glu | 0.0016 | 0.0008 | 0.000 | 0.003 | 0.04 |
trg | 0.0003 | 0.0003 | 0.000 | 0.001 | 0.28 |
sbp | -0.0003 | 0.0017 | -0.003 | 0.002 | 0.84 |
dbp | 0.0036 | 0.0027 | -0.001 | 0.008 | 0.18 |
ethnicityBlack | 0.1642 | 0.0742 | 0.042 | 0.286 | 0.03 |
ethnicityAsian | -0.2340 | 0.0860 | -0.376 | -0.092 | 0.01 |
ethnicityMA | 0.0468 | 0.0830 | -0.090 | 0.183 | 0.57 |
ethnicityLatino | 0.1693 | 0.0942 | 0.014 | 0.324 | 0.07 |
ethnicityOther | 0.1865 | 0.1185 | -0.009 | 0.382 | 0.12 |
sexfemale | 0.3839 | 0.0536 | 0.296 | 0.472 | 0.00 |
Intercept
size: -3.0717 log(mg/L)
magnitude: this is fairly large in absolute value and negative.
meaning: this is the value of log(crp) when the waist circumference is 0, the sex is neither male nor female, and the person was dead (i.e. zero fasting blood glucose, triglycerides, blood pressure values) and is not of any ethnicity or sex. Clearly, this is meaningless as this is not possible.
effect sizes: The 90% CI is 3.4712 - 2.6723. It does not cross 0, so this is significant
slope of waist
size: 0.0309 mg/L
magnitude: this is in the positive direction and it appears fairly small, but so is 1 cm. Therefore, I think it is appropriate.
meaning: for every 1 cm increase in waist size, the log of crp will increase by 0.0309 mg/L (holding fast_glu
, trg
, sbp
, dbp
, ethnicity
and sex
constant )
effect sizes: The 90% CI around this estimate is 0.0281 to 0.0337. This is significant because it does not cross 0, thus there is not zero value in adding this to our prediction equation. It is also fairly tight and therefore precise. The range is meaningful. Also, this means that if we were to repeat this method 100 times, 90 of those confidence intervals would contain the true slope. We don’t know if this is one of the 90 or not.
slope of fasting glucose (fast_glu)
size: 0.0016 mg/L
magnitude: this is in the positive direction and it appears fairly small, but so is 1 mg/dL in relation. Therefore, I think it is appropriate.
meaning: for every 1 mg/dL increase in fasting glucose, the log of crp will increase by 0.0016 mg/L (holding waist
, trg
, sbp
, dbp
, ethnicity
and sex
constant)
effect sizes: The 90% CI around this estimate is 0.0003 to 0.0029. This is significant because it does not cross 0, thus there is not zero value in adding this to our prediction equation. It is also fairly tight and therefore precise. The range is meaningful. Also, this means that if we were to repeat this method 100 times, 90 of those confidence intervals would contain the true slope. We don’t know if this is one of the 90 or not.
slope of trg (trg)
size: 0.0003 mg/L
magnitude: this is in the positive direction and it appears fairly small, but so is 1 mg/dL in relation. Therefore, I think it is appropriate.
meaning: for every 1 mg/dL increase in triglycerides, the log of crp will increase by 0.0003 mg/L. (holding waist
, fast_glu
, sbp
, dbp
, ethnicity
and sex
constant)
effect sizes: The 90% CI around this estimate is -0.0002 to 0.0008. This is not significant because it crosses 0, thus there is no value in adding this to our prediction equation. It is also fairly wide and therefore not precise. The range is not meaningful. Also, this means that if we were to repeat this method 100 times, 90 of those confidence intervals would contain the true slope. We don’t know if this is one of the 90 or not.
slope of sbp
size: -0.0003 mg/L
magnitude: this is in the negative direction and it appears comparatively small.
meaning: for every 1 mg/ Hg increase in systolic blood pressure, the log of crp will decrease by 0.0003 mg/L (holding waist
, fast_glu
, trg
, dbp
, ethnicity
and sex
constant) This is contrary to what has previously been found with crp’s relation to the components of the factors contributing to metabolic syndrome. Previously it has been shown that as each component of the metabolic syndrome equation increases, so does crp.
effect sizes: The 90% CI around this estimate is -0.0031 to 0.0024. This is not significant because it crosses 0, thus there is no value in adding this to our prediction equation. It is also fairly wide and therefore not precise. The range is not meaningful. Also, this means that if we were to repeat this method 100 times, 90 of those confidence intervals would contain the true slope. We don’t know if this is one of the 90 or not.
slope of dbp
size: 0.0036 mg/L
magnitude: this is in the positive direction and it appears comparatively small.
meaning: for every 1 mg/ Hg increase in diastolic blood pressure, the log of crp will increase by 0.0036 mg/L (holding waist
, fast_glu
, trg
, sbp
, ethnicity
and sex
constant)
effect sizes: The 90% CI around this estimate is -0.0008 to 0.008. This is not significant because it crosses 0, thus there is no value in adding this to our prediction equation. It is also fairly wide and therefore not precise. The range is not meaningful. Also, this means that if we were to repeat this method 100 times, 90 of those confidence intervals would contain the true slope. We don’t know if this is one of the 90 or not.
Coefficients for ethnicity:
waist
, fast_glu
, trg
, sbp
, dbp
, and sex
. Similarly, relative to Whites, given equivalent waist
, fast_glu
, trg
, sbp
, dbp
, and sex
, Blacks, Mexican Americans, Latinos, and Other ethnicities have on average 0.1642, 0.0468, 0.0468, 0.1693, and 0.1865 greater log of crp. The magnitude of these differences from white race seem small-moderate, but make the biggest impact for the Black ethnicity. This is consistent with previous literature which suggests that on average, Blacks have a greater CRP than other races. Furthermore, a large percentage of crp is determined by genetics.We are 90% confident that the true coefficients for each of these ethnicities are:
Black: 0.0422 to 0.2863
Asian: 0.3756 to -0.0925
Mexican American: 0.0899 to 0.1835
Latino: 0.0142 to 0.3244
Other ethnicity: 0.0086 to 0.3816
According to the 90% confidence intervals these estimates provide very little value. The 90% CI cross 0 for Mexican Americans and Other race, which may deem them of no use. Furthermore, the confidence intervals for all of these are incredibly wide, thus not precise, and of very little use. If the true estimate is contained within one of these confidence intervals (a 90% chance that they do), it is either so wide that its not very helpful or some contain 0 which means they’re really not helpful.
Coefficients for sex:
waist
, fast_glu
, trg
, sbp
, dbp
, and ethnicity
. The magnitude of this difference is moderate. This makes sense because when considering if someone has metabolic syndrome, the waist size is dependent on if the sex is male or female.The equation of the big kitchen sink model is as follows:
extract_eq(model_big, use_coefs = TRUE, coef_digits = 4,
terms_per_line = 3, wrap = TRUE, ital_vars = TRUE)
\[ \begin{aligned} log(crp) &= -3.0717 + 0.0309(waist) + 0.0016(fast\_glu)\ + \\ &\quad 3e-04(trg) - 3e-04(sbp) + 0.0036(dbp)\ + \\ &\quad 0.1642(ethnicity_{Black}) - 0.234(ethnicity_{Asian}) + 0.0468(ethnicity_{MA})\ + \\ &\quad 0.1693(ethnicity_{Latino}) + 0.1865(ethnicity_{Other}) + 0.3839(sex_{female})\ + \\ &\quad \epsilon \end{aligned} \]
I will build a second linear regression model using a subset of the “kitchen sink” model predictors, chosen to maximize predictive value within nh_train
.
Here, I will just include the key predictor, waist
, and adjust for sex
. I did not want to use the stepwise regression method to determine which predictors because although this seems like a “sexy” (as Dr. Love phrases it), it doesn’t work well. Thus, I only used my key predictor and adjusted for sex because in the metabolic syndrome equation, evaluation of the waist size is dependent upon sex. Also, we can see in the box plot that there is a fairly decent and meaningful separation in the log of crp between sexes.
model_small <- lm((log(crp)) ~ waist + sex, data = nh_train)
Below are the coefficients of the small model:
tidy_m1 <- tidy(model_small, conf.int = TRUE, conf.level = 0.90)
tidy_m1 %>%
select(term, estimate, std.error, p.value,
conf.low, conf.high) %>% knitr::kable(digits = 4)
term | estimate | std.error | p.value | conf.low | conf.high |
---|---|---|---|---|---|
(Intercept) | -2.8583 | 0.1627 | 0 | -3.1262 | -2.5904 |
waist | 0.0337 | 0.0016 | 0 | 0.0312 | 0.0363 |
sexfemale | 0.3622 | 0.0531 | 0 | 0.2747 | 0.4496 |
intercept
size: -2.8583 log(mg/L)
magnitude: this is fairly large in the negative direction
meaning: this is the value of log(crp) when the waist circumference is 0 and the sex is neither male nor female. Clearly, this is meaningless as this is not possible.
effect sizes: The 90% CI is -3.1262 -2.5904. It does not cross 0, so this is significant
slope of waist
size: 0.0337 mg/L
magnitude: this is in the positive direction and it appears fairly small, but so is 1 cm. Therefore, I think it is appropriate.
meaning: for every 1 cm increase in waist size, the log of crp will increase by 0.0337 mg/L (holding sex
constant)
effect sizes: The 90% CI around this estimate is 0.0312 to 0.0363. This is significant because it does not cross 0, thus there is not zero value in adding this to our prediction equation. It is also fairly tight and therefore precise. The range is meaningful. Also, this means that if we were to repeat this method 100 times, 90 of those confidence intervals would contain the true slope. We don’t know if this is one of the 90 or not.
slope of sex
size: 0.3622 mg/L
magnitude: this is in the positive direction and it appears comparatively large.
meaning: If the sex of a person is female, the log of crp will be 0.3622 mg/L higher than males (holding waist
constant). Sex appears to be a fairly meaningful predictor for somebody’s log crp due to the size in magnitude difference.
effect sizes: The 90% confidence interval around this estimate is 0.2747 to 0.4496. This is significant because it does not cross 0, thus there is not zero value in adding this to our prediction equation. It is not that tight and therefore it doesn’t have high precision.
Below is my small model equation
extract_eq(model_small, use_coefs = TRUE, coef_digits = 4,
terms_per_line = 3, wrap = TRUE, ital_vars = TRUE)
\[ \begin{aligned} (log(crp)) &= -2.8583 + 0.0337(waist) + 0.3622(sex_{female})\ + \\ &\quad \epsilon \end{aligned} \]
I will compare the small model to the big model in nh_train
using: adjusted R2, residual standard error, AIC and BIC.
temp_a <- glance(model_big) %>%
select(-logLik, -deviance) %>%
round(digits = 3) %>%
mutate(name = "big")
temp_b <- glance(model_small) %>%
select(-logLik, -deviance) %>%
round(digits = 3) %>%
mutate(name = "small")
training_comp <- bind_rows(temp_a, temp_b) %>%
select(name, r.squared, adj.r.squared, sigma, AIC, BIC) %>%
knitr::kable(digits = c(0, 3, 3, 3, 0, 0))
training_comp
name | r.squared | adj.r.squared | sigma | AIC | BIC |
---|---|---|---|---|---|
big | 0.292 | 0.286 | 0.951 | 3616 | 3683 |
small | 0.275 | 0.274 | 0.959 | 3630 | 3651 |
It looks like the bigger model with 7 predictors: age
, crp
, waist
, fast_glu
, trg
, sbp
, dbp
, sex
, and ethnicity
, performs slightly better in the training sample.
Here, I will run a set of residual plots for each model in order to evaluate the regression assumptions:
Linearity
Constant variance
Normality
NOTE: this is cross sectional data, therefore the data are independent
Linearity: The residual vs. fitted plot has no pattern and no curve. Thus,there are no problems with linearity.
Constant variance: The residual vs. fitted plot does not have a fan or funnel shape. The scale location plot has an essentially straight loess line. Thus, the data are homoscedastic.
Normality: The normal Q-Q plot shows right skew & outliers. The residuals vs. leverage plot shows 3 standardized residuals above 4 standard deviations and 11 standardized residuals between 3-4 standard deviations (we only expect to see at most 4 points above 3 with a sample size of 1316) . This is not normal, especially with a data set of this size.
Independence: the residuals appear randomly scattered and show no patterns, trends, or clumps against the predicted values. Thus, the data are from a random sample.
Lastly, although this is not an assumption of linear regression, it is important to note that there are no highly influential points (all points have a cooks distance of <0.5)
aug1 <- augment(model_big, data = nh_train) %>%
mutate(log_crp = log(crp))
p1 <- ggplot(aug1, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_point(data = aug1 %>%
slice_max(abs(.resid), n = 3),
col = "red", size = 1) +
geom_text_repel(data = aug1 %>%
slice_max(abs(.resid), n = 3),
aes(label = SEQN), col = "red") +
geom_abline(intercept = 0, slope = 0, lty = "dashed") +
geom_smooth(method = "loess", formula = y ~ x, se = F) +
labs(title = "Residuals vs. Fitted",
x = "Fitted Value of (log(crp))", y = "Residual")
p2 <- ggplot(aug1, aes(sample = .std.resid)) +
geom_qq() +
geom_qq_line(col = "red") +
labs(title = "Normal Q-Q plot",
y = "Standardized Residual",
x = "Standard Normal Quantiles")
p3 <- ggplot(aug1, aes(x = .fitted, y = sqrt(abs(.std.resid)))) +
geom_point() +
geom_smooth(method = "loess", formula = y ~ x, se = F) +
labs(title = "Scale-Location Plot",
x = "Fitted Value of (log(crp))",
y = "|Std. Residual|^(1/2)")
p4 <- ggplot(aug1, aes(x = .hat, y = .std.resid)) +
geom_point() +
geom_point(data = aug1 %>% filter(.cooksd >= 0.5),
col = "red", size = 2) +
geom_text_repel(data = aug1 %>% filter(.cooksd >= 0.5),
aes(label = SEQN), col = "red") +
geom_smooth(method = "loess", formula = y ~ x, se = F) +
geom_vline(aes(xintercept = 3*mean(.hat)), lty = "dashed") +
labs(title = "Residuals vs. Leverage",
x = "Leverage", y = "Standardized Residual")
(p1 + p2) / (p3 + p4) +
plot_annotation(title = "Assessing Residuals for the big model with the NHANES training sample (N=1316)",caption = "If applicable, Cook's d >= 0.5 shown in red in bottom right plot.")
Of note, outliers can be due to:
poorly fit: we can see those in the residuals vs. leverage plot where we have many more point than we would expect with a standardized residual >3.
they have unusual x values: we would can see those in the residuals vs leverage plot. We have several perhaps unusual predictor values (to the left dashed line which is 3 times the average leverage point)
it is highly influential: this combines part 1 and 2. if they are highly influential, if we take them out of the model, our coefficients will change. We can identify these if they have a cooks distance >0.5. We do not see any of those points in the residuals vs leverage plot.
Assumptions of linear regression
Linearity: no pattern to the residual vs fitted plot
Constant variance: no fan or funnel shape in the residual vs fitted plot & the loess line is essentially straight in the residual vs fitted plot
Normality: right skew in normal QQ plot with outliers.
Independence: The residuals appear to be randomly scattered and show no patterns, trends or clumps when plotted against the predicted values. Also, this is cross sectional data, thus it has to be independent.
aug2 <- augment(model_small, data = nh_train) %>%
mutate(log_crp = log(crp))
p1 <- ggplot(aug2, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_point(data = aug2 %>%
slice_max(abs(.resid), n = 3),
col = "red", size = 1) +
geom_text_repel(data = aug2 %>%
slice_max(abs(.resid), n = 3),
aes(label = SEQN), col = "red") +
geom_abline(intercept = 0, slope = 0, lty = "dashed") +
geom_smooth(method = "loess", formula = y ~ x, se = F) +
labs(title = "Residuals vs. Fitted",
x = "Fitted Value of (log(crp))", y = "Residual")
p2 <- ggplot(aug2, aes(sample = .std.resid)) +
geom_qq() +
geom_qq_line(col = "red") +
labs(title = "Normal Q-Q plot",
y = "Standardized Residual",
x = "Standard Normal Quantiles")
p3 <- ggplot(aug2, aes(x = .fitted, y = sqrt(abs(.std.resid)))) +
geom_point() +
geom_smooth(method = "loess", formula = y ~ x, se = F) +
labs(title = "Scale-Location Plot",
x = "Fitted Value of (log(crp))",
y = "|Std. Residual|^(1/2)")
p4 <- ggplot(aug2, aes(x = .hat, y = .std.resid)) +
geom_point() +
geom_point(data = aug2 %>% filter(.cooksd >= 0.5),
col = "red", size = 2) +
geom_text_repel(data = aug2 %>% filter(.cooksd >= 0.5),
aes(label = SEQN), col = "red") +
geom_smooth(method = "loess", formula = y ~ x, se = F) +
geom_vline(aes(xintercept = 3*mean(.hat)), lty = "dashed") +
labs(title = "Residuals vs. Leverage",
x = "Leverage", y = "Standardized Residual")
(p1 + p2) / (p3 + p4) +
plot_annotation(title = "Assessing Residuals for the small model (2 predictors) in the training sample (n=1316)",
caption = "If applicable, Cook's d >= 0.5 shown in red in bottom right plot.")
NOTE: according to the residuals vs. leverage plot we do not have any highly influential points as no points have a cooks distance >0.5 (not an assumption of normality)
Collinearity does not have a meaningful impact ( no VIF values ≥5 )
car::vif(model_big)
GVIF Df GVIF^(1/(2*Df))
waist 1.212687 1 1.101221
fast_glu 1.167559 1 1.080537
trg 1.200327 1 1.095594
sbp 1.461356 1 1.208865
dbp 1.417064 1 1.190405
ethnicity 1.181637 5 1.016830
sex 1.042280 1 1.020921
-The models performed similarly with the training sample.
Neither showed meaningful deviations from the assumptions of linearity or constant variance, however both were slightly right skewed (due to some unusual predictor values).
Neither had highly influential points.
However, the bigger model had slightly better fit quality measures (adjusted r squared, sigma, and AIC).
Based on the training sample, my conclusion so far is to support the bigger model.
I will compare the two models in terms of mean squared prediction error and mean absolute prediction error.
I will use the augment function to create predictions within my new sample, however I need to back-transform the predictions so they are on the original scale (crp, rather than my transformed regression outcome log(crp)).
Since the way to back out of the log transformation is to apply a re-expression, I will do exp(fitted) and then calculate residuals on the original scale.
aug_big <- augment(model_big, newdata = nh_test) %>%
mutate(name = "big",
crp_fit = exp(.fitted),
crp_res = crp - crp_fit) %>%
select(SEQN, name, crp, crp_fit, crp_res, waist, fast_glu, trg, sbp, dbp, ethnicity, sex)
head(aug_big,3) %>%
knitr::kable(digits = 2)
SEQN | name | crp | crp_fit | crp_res | waist | fast_glu | trg | sbp | dbp | ethnicity | sex |
---|---|---|---|---|---|---|---|---|---|---|---|
93717 | big | 0.80 | 0.96 | -0.16 | 86.2 | 91 | 122 | 116 | 62 | White | male |
93718 | big | 0.69 | 0.92 | -0.23 | 77.5 | 89 | 57 | 128 | 88 | Black | male |
93721 | big | 3.04 | 3.51 | -0.47 | 113.2 | 104 | 92 | 132 | 68 | MA | female |
I did the same thing, but using the small model in the nh_test
data.
aug_small <- augment(model_small, newdata = nh_test) %>%
mutate(name = "small",
crp_fit = exp(.fitted),
crp_res = crp - crp_fit)
head(aug_small,3) %>%
select(SEQN, name, crp, crp_fit, crp_res, waist,sex) %>%
knitr::kable(digits = 2)
SEQN | name | crp | crp_fit | crp_res | waist | sex |
---|---|---|---|---|---|---|
93717 | small | 0.80 | 1.05 | -0.25 | 86.2 | male |
93718 | small | 0.69 | 0.78 | -0.09 | 77.5 | male |
93721 | small | 3.04 | 3.76 | -0.72 | 113.2 | female |
Below I have combined the back-transformed results for both the big and the small models
test_comp <- bind_rows(aug_big, aug_small) %>%
arrange(SEQN, name)
test_comp %>% select(name, SEQN, crp, crp_fit, crp_res, waist, fast_glu, trg, sbp, dbp, ethnicity, sex) %>%
slice(1:3, 7:9) %>%
knitr::kable(digits = c(0, 0, 1, 2, 2, 1, 0, 0))
name | SEQN | crp | crp_fit | crp_res | waist | fast_glu | trg | sbp | dbp | ethnicity | sex |
---|---|---|---|---|---|---|---|---|---|---|---|
big | 100008 | 8.1 | 1.84 | 6.26 | 101.1 | 101 | 90 | 126 | 72 | Asian | female |
small | 100008 | 8.1 | 2.50 | 5.60 | 101.1 | 101 | 90 | 126 | 72 | Asian | female |
big | 100015 | 1.4 | 1.04 | 0.40 | 87.0 | 99 | 78 | 114 | 76 | White | male |
big | 100047 | 1.7 | 1.47 | 0.23 | 90.7 | 102 | 136 | 156 | 92 | Black | male |
small | 100047 | 1.7 | 1.22 | 0.48 | 90.7 | 102 | 136 | 156 | 92 | Black | male |
big | 100061 | 0.9 | 0.97 | -0.04 | 80.2 | 105 | 115 | 112 | 68 | Asian | female |
The plot below indicates that the models perform similarly in their predictions.
ggplot(test_comp, aes(x = crp, y = crp_res,
group = name)) +
geom_line(aes(col = name)) +
geom_point(aes(col = name)) +
geom_label_repel(data = test_comp %>%
filter(SEQN == "97505"),
aes(label = SEQN)) +
labs(x = "Observed crp (mg/L)", y = "Prediction Errors on crp scale",
title = "Comparison of the errors made at each level of observed crp", subtitle = "big vs. small model using the test sample (N=564)", caption = "similar prediction errors")
There is no substantial separation, thus these are fairly similar results across both models.
The mean average prediction error (MAPE), square root of the mean squared prediction error (RMSPE), and maximum error were all slightly lower in the big model.
NOTE: predicting a crp within 2.5 mg/L is not useful.
Such a discrepancy could wrongly categorize someone who is at ‘intermediate risk’ of experiencing cardiovascular event as having a ‘high risk’ of experiencing a cardiovascular event.Obviously, missing by 48 is a terrible prediction. Ideally, we would like a prediction within a decimal place.
test_comp %>%
group_by(name) %>%
summarize(n = n(),
MAPE = mean(abs(crp_res)),
RMSPE = sqrt(mean(crp_res^2)),
max_error = max(abs(crp_res))) %>%
knitr:: kable(digits = c(0, 0, 2, 2, 2))
name | n | MAPE | RMSPE | max_error |
---|---|---|---|---|
big | 564 | 2.55 | 5.52 | 48.81 |
small | 564 | 2.59 | 5.57 | 48.82 |
Both models made their largest error on subject 97505 (observed crp
= 49.95 mg/L)
temp1 <- aug_big %>%
filter(abs(crp_res) == max(abs(crp_res)))
temp2 <- aug_small %>%
filter(abs(crp_res) == max(abs(crp_res)))
bind_rows(temp1, temp2) %>% select(SEQN, name, crp, crp_fit, crp_res) %>% knitr::kable(digits = c(0, 0, 2, 2, 2))
SEQN | name | crp | crp_fit | crp_res |
---|---|---|---|---|
97505 | big | 49.95 | 1.14 | 48.81 |
97505 | small | 49.95 | 1.13 | 48.82 |
The models preformed similarly out of sample. The bigger model had slightly lower MAPE, RMSPE, and maximum error. Thus, the big model wins here (but by a minuscule amount).
The big model did slightly better both in sample and out sample, but only by a minuscule amount. The performance in sample was better based on adjusted r squared, sigma, and AIC. The residual plots looked similar between the models, with their greatest weakness being adhering to the assumption of normality. Out of sample, the MAPE, RMPSE, and max error were so incredibly close for the two models, but they were slightly lower for the bigger model. Regardless, the errors were so huge (on average about 2.5 mg/L), that the models are not of any use.
I chose the small model because it was the most simple. The big model performed slightly better in sample and out of sample, but it was minuscule. Both models are useless in predicting crp
because they were off by 2.5 mg/L on average. Given almost equal wrongness, I would rather choose the one that is more simple and wrong than more complex and wrong.
Using the 2017 NHANES data set, and including all adults age 20-79 with complete and plausible data on variables of interest (N=1880), I was not able to predict someone’s crp level well based on their waist circumference and adjusting for sex. When the model was tested out of sample, on average, the predictions were off by about 2.5 mg/dL, which is not useful at all; furthermore, these predictions did not improve when ethnicity and other components of metabolic syndrome criteria were factored in.
My next steps:
Refine eligibility criteria: I did not have a valid study population for my research question. crp is highly unspecific and may be increased due to a variety of conditions that cause inflammation (e.g. cancer, infection, rheumatoid arthritis, etc.) My next steps might be to only include people who are at greater risk of experiencing a cardiovascular event (e.g. people with diabetes age ≥40) and who do not have a history of other conditions known to increase crp.
Missing at random: I should consider using imputation instead of missing completely at random (MCAR). MCAR is a huge assumption that is essentially never true.
Serial crp measurements: A crp level >10 mg/dL indicates an acute inflammatory response. If a crp was >10 then it would have been nice to have a recheck in a week.
What I would have changed if I had known at the start:
Do not spend 7 hours trying to pick variables based on what will move the r squared the most
We only compared two models, so it really wasn’t worth the trouble investing all that time trying to see what variables could have the biggest impact. We aren’t being graded on our final r-squared.
I shouldn’t have even been using r squared to make that decision anyway. If I was going to go through that trouble again (which is pointless and silly) then I should have compared adjusted r squared, sigma, AIC, and BIC.
Have a more narrowly defined study population besides just NHANES adults.
Pick less, but more meaningful variables
sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_0.5.0 stringr_1.4.0 purrr_0.3.4 readr_1.3.1
[5] tidyr_1.1.2 tibble_3.0.3 tidyverse_1.3.0 mosaic_1.7.0
[9] Matrix_1.2-18 mosaicData_0.18.0 ggformula_0.9.4 ggstance_0.3.4
[13] lattice_0.20-41 dplyr_1.0.2 broom_0.7.0 patchwork_1.0.1
[17] ggrepel_0.8.2 GGally_2.0.0 ggplot2_3.3.2 car_3.0-9
[21] carData_3.0-4 naniar_0.5.2 magrittr_1.5 janitor_2.0.1
[25] nhanesA_0.6.5 equatiomatic_0.1.0
loaded via a namespace (and not attached):
[1] colorspace_1.4-1 ellipsis_0.3.1 rio_0.5.16
[4] visdat_0.5.3 leaflet_2.0.3 snakecase_0.11.0
[7] htmlTable_2.0.1 base64enc_0.1-3 ggdendro_0.1.21
[10] fs_1.5.0 rstudioapi_0.11 farver_2.0.3
[13] fansi_0.4.1 lubridate_1.7.9 xml2_1.3.2
[16] splines_4.0.2 knitr_1.29 polyclip_1.10-0
[19] Formula_1.2-3 jsonlite_1.7.0 cluster_2.1.0
[22] dbplyr_1.4.4 png_0.1-7 ggforce_0.3.2
[25] compiler_4.0.2 httr_1.4.2 backports_1.1.9
[28] assertthat_0.2.1 lazyeval_0.2.2 cli_2.0.2
[31] tweenr_1.0.1 htmltools_0.5.0 tools_4.0.2
[34] gtable_0.3.0 glue_1.4.2 Rcpp_1.0.5
[37] cellranger_1.1.0 vctrs_0.3.4 nlme_3.1-149
[40] crosstalk_1.1.0.1 xfun_0.16 openxlsx_4.1.5
[43] rvest_0.3.6 lifecycle_0.2.0 mosaicCore_0.6.0
[46] MASS_7.3-52 scales_1.1.1 hms_0.5.3
[49] RColorBrewer_1.1-2 yaml_2.2.1 curl_4.3
[52] gridExtra_2.3 rpart_4.1-15 reshape_0.8.8
[55] latticeExtra_0.6-29 stringi_1.4.6 highr_0.8
[58] checkmate_2.0.0 zip_2.1.1 rlang_0.4.7
[61] pkgconfig_2.0.3 evaluate_0.14 htmlwidgets_1.5.1
[64] labeling_0.3 tidyselect_1.1.0 plyr_1.8.6
[67] R6_2.4.1 generics_0.0.2 Hmisc_4.4-1
[70] DBI_1.1.0 pillar_1.4.6 haven_2.3.1
[73] foreign_0.8-80 withr_2.2.0 mgcv_1.8-33
[ reached getOption("max.print") -- omitted 15 entries ]