library(skimr)
library(knitr)
library(rbounds)
library(tableone)
library(magrittr); library(janitor)
library(broom); library(survival); library(lme4)
library(cobalt); library(Matching); library(MatchIt)
library(naniar)
library(simputation)
library(mice)
library(Epi)
library(twang)
library(survey)
library(knitr)
library(rbounds)
library(patchwork)
library(tidyverse)
decim <- function(x, k) format(round(x, k), nsmall=k)
options(max.print="250")
opts_knit$set(width=75)
theme_set(theme_bw())
sample <- read.csv("sample2.csv")
meps_clean <- sample %>%
mutate(dupersid=as.character(dupersid)) %>%
rename(RA=max_ra,
pneumo=max_pneu,
age=agelast) %>%
mutate(RA_f = fct_recode(factor(RA),
"Yes" = "1", "No" = "0")) %>%
mutate(pneumo_f = fct_recode(factor(pneumo),
"Yes" = "1", "No" = "0")) %>%
select(-ra, -pneu, -icd10cdx, -condn, -rxnum, -pid, -duid)
meps_clean <- meps_clean %>%
mutate(region = fct_recode(factor(region),
"Northeast" = "1", "Midwest" = "2", "South" = "3", "West" = "4",
NULL = "-1")) %>%
mutate(panel = as.factor(panel)) %>%
mutate(region = fct_infreq(region)) %>%
mutate(race=fct_recode(factor(racethx),
"Hispanic" = "1", "White" = "2", "Black" = "3", "Other" = "4", "Other" = "5")) %>%
mutate(race = fct_infreq(race)) %>%
mutate(sex = fct_recode(factor(sex),
"M" = "1", "F" = "2")) %>%
mutate(education =
fct_recode(factor(hideg),
"no_degree" = "1", "GED" = "2", "HS" = "3", "BA"= "4", "graduate" = "5", "graduate"= "6", "Other"="7", NULL = "-15", NULL = "-8", NULL = "-7", NULL = "8")) %>%
mutate(public_ins = fct_recode(factor(inscov),
"No" = "1", "Yes" = "2", "No"= "3")) %>%
mutate(public_ins = fct_relevel(public_ins, "Yes")) %>%
mutate(income = fct_recode(factor(povcat),
"Poor" = "1", "Near_poor" = "2", "Low" = "3", "Middle"= "4", "High"="5")) %>%
mutate(marital = fct_recode(factor(marry42x),
NULL = "-9", NULL= "-8", NULL = "-7", NULL = "-1", "married"= "1","married" = "7", "widowed"= "2", "widowed"= "8", "divorced" = "3", "divorced" = "9", "separated" = "10", "separated" = "4", "never_married" = "5", )) %>%
mutate(marital = fct_infreq(marital)) %>%
mutate(employment = fct_recode(factor(empst31),
"No" = "4", "Yes"= "3", "Yes"= "2", "Yes"="1", NULL = "-15", NULL = "-7", NULL = "-8", NULL = "-1", NULL= "-9"))
meps_clean <- meps_clean %>%
mutate(prim_care = fct_recode(factor(haveus42),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7",
NULL = "-1", NULL = "-9")) %>%
mutate(flu_vac = fct_recode(factor(flu_vac),
"Yes" = "1", "No" = "2", NULL = "-15", NULL = "-1", NULL = "-9")) %>%
mutate(asthma = fct_recode(factor(asthdx),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7", NULL = "-1", NULL = "-9")) %>%
mutate(asthma = fct_relevel(asthma, "Yes")) %>%
mutate(diabetes = fct_recode(factor(diabetes),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7", NULL = "-1", NULL = "-15", NULL = "-9")) %>%
mutate(diabetes = fct_relevel(diabetes, "Yes")) %>%
mutate(cancer = fct_recode(factor(cancerdx),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7", NULL = "-1", NULL = "-15", NULL = "-9")) %>%
mutate(cancer = fct_relevel(cancer, "Yes")) %>%
mutate(bronchitis = fct_recode(factor(chbron31),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7", NULL = "-1", NULL = "-15", NULL = "-9")) %>%
mutate(bronchitis = fct_relevel(bronchitis, "Yes")) %>%
mutate(emphysema = fct_recode(factor(emphdx),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7 ", NULL = "-1", NULL = "-15", NULL = "-9")) %>%
mutate(emphysema = fct_relevel(emphysema, "Yes")) %>%
mutate(stroke = fct_recode(factor(strkdx),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7 ", NULL = "-1", NULL = "-15", NULL = "-9")) %>%
mutate(stroke = fct_relevel(stroke, "Yes"))
meps_clean <- meps_clean %>%
mutate(smoke=factor(smoke)) %>%
mutate(perc_health = fct_recode(factor(rthlth31),
"Poor" = "5", "Fair" = "4", "Good"= "3", "VG_Exc"= "2", "VG_Exc"="1", NULL = "-1", NULL = "-7", NULL = "-8")) %>%
mutate(mental = fct_recode(factor(mnhlth31),
"Poor" = "5", "Fair" = "4", "Good"= "3", "VG"= "2", "Excellent"="1", NULL = "-1", NULL = "-7", NULL = "-8")) %>%
mutate(IADL = fct_recode(factor(iadlhp31),
"Yes" = "1", "No"= "2", NULL = "-8", NULL = "-7", NULL = "-1")) %>%
mutate(ADL = fct_recode(factor(adlhlp31),
"Yes" = "1", "No"= "2", NULL = "-8", NULL = "-7", NULL = "-1")) %>%
mutate(difficult_lift10 = fct_recode(factor(lftdif31),
"no" = "1", "some"= "2", "a_lot" = "3", "unable"= "4", NULL = "-8", NULL = "-7", NULL = "-1")) %>%
mutate(phys_lim = fct_recode(factor(wlklim31),
NULL = "-8", NULL= "-7", NULL = "-1", "yes"= "1","no" = "2")) %>%
mutate(cog_dif = fct_recode(factor(dfcog42),
NULL = "-8", NULL= "-7", NULL = "-1", "yes"= "1","no" = "2"))
meps_clean <- meps_clean %>%
mutate(highbp = fct_recode(factor(hibpdx),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7", NULL = "-1", NULL = "-9")) %>%
mutate(highbp = fct_relevel(highbp, "Yes")) %>%
mutate(chd = fct_recode(factor(chddx),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7", NULL = "-1", NULL = "-9")) %>%
mutate(chd = fct_relevel(chd, "Yes")) %>%
mutate(angina = fct_recode(factor(angidx),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7", NULL = "-1", NULL = "-9")) %>%
mutate(angina = fct_relevel(angina, "Yes")) %>%
mutate(MI = fct_recode(factor(midx),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7", NULL = "-1", NULL = "-9")) %>%
mutate(MI = fct_relevel(MI, "Yes")) %>%
mutate(other_heart = fct_recode(factor(ohrtdx),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7", NULL = "-1", NULL = "-9")) %>%
mutate(other_heart = fct_relevel(other_heart, "Yes")) %>%
mutate(high_chol = fct_recode(factor(choldx),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7", NULL = "-1", NULL = "-9")) %>%
mutate(high_chol = fct_relevel(high_chol, "Yes")) %>%
mutate(arthritis = fct_recode(factor(arthdx),
"Yes" = "1", "No" = "2", NULL = "-8", NULL = "-7", NULL = "-1", NULL = "-9")) %>%
mutate(arthritis = fct_relevel(arthritis, "Yes"))
meps_clean <- meps_clean %>%
select(dupersid, RA, RA_f, pneumo, pneumo_f, totexp, panel, age, sex, race, marital, region, income, education, public_ins, prim_care, flu_vac, asthma, diabetes, cancer, bronchitis, emphysema, stroke, perc_health, mental, employment, IADL, ADL, smoke, phys_lim, cog_dif, highbp, chd, angina, MI, other_heart, high_chol, arthritis)
The original MEPS data was collected prospectively over 2 years. For my study, there was no regard to time when the exposure,RA
and the outcome, pneumonia
were collected (i.e. which came before the other). The covariate information is from the beginning of the study period, so at the same time or before the information about medical conditions were obtained (3 in person interviews per year).
Information was obtained from
https://meps.ahrq.gov/data_stats/download_data/pufs/h209/h209doc.pdf
paste(colnames(meps_clean), collapse = " | ")
[1] "dupersid | RA | RA_f | pneumo | pneumo_f | totexp | panel | age | sex | race | marital | region | income | education | public_ins | prim_care | flu_vac | asthma | diabetes | cancer | bronchitis | emphysema | stroke | perc_health | mental | employment | IADL | ADL | smoke | phys_lim | cog_dif | highbp | chd | angina | MI | other_heart | high_chol | arthritis"
Variable | Type | Description |
---|---|---|
dupersid | character | subject identifier |
panel | factor (3 levels) | corresponds to the cohort subjects belong (21, 22, 23) |
RA | numeric | whether or not subject has rheumatoid arthritis (1=yes, 0=no). This will be the exposure variable. |
RA_f | binary | whether or not subject has rheumatoid arthritis (yes, no). This will be the exposure variable. |
pneumo | numeric | whether or not subject has pneumonia (1=yes, 0=no). This will be the outcome variable. |
pneumo_f | binary | whether or not subject has pneumonia (yes, no). This will be the outcome variable. |
totexp | numeric | total medical expenditure for the year |
age | integer | age in years |
sex | F/M | F = Female, M = Male |
race | factor (4 levels) | White, Hispanic, Black, Other |
marital | factor (5 levels) | marital status (married, never_married, divorced, widowed, separated) |
region | factor (4 levels) | Census Region (South, West, Midwest, Northeast) |
income | factor (5 levels) | Family income as a % of poverty line: Poor, Near_poor, Low, Middle, High |
education | factor (6 levels) | highest degree when first entered MEPS (no_degree, GED, HS, BA, graduate, Other) |
public_ins | binary | Whether or not has public insurance (Yes, No) |
prim_care | binary | Does a person have a usual care provider (Yes, No) |
flu_vac | binary | Flu vaccination in past 12 months (Yes, No) |
asthma | binary | Asthma diagnosis (Yes, No) |
diabetes | binary | Diabetes diagnosis (Yes, No) |
cancer | binary | Cancer diagnosis (Yes, No) |
bronchitis | binary | Bronchitis diagnosis (Yes, No) |
emphysema | binary | emphysema diagnosis (Yes, No) |
stroke | binary | history of stroke (Yes, No) |
perc_health | factor (4 levels) | Perceived health status (VG_Exc = collapsed very good/excellent, good, fair, poor) |
mental | factor (5 levels) | Perceived mental health status (excellent, very good, good, fair, poor) |
employment | binary | currently employed (Yes, No) |
IADL | binary | require help with Instrumental Activities of Daily Living (Yes, NO) |
ADL | binary | require help with Activities of Daily Living (Yes, NO) |
smoke | binary | Currently (Yes, No) |
phys_lim | binary | difficulty in performing certain specific physical actions such as walking, climbing stairs, grasping objects, reaching overhead, lifting, bending or stooping, or standing for long periods of time |
cog_dif | binary | difficulty concentrating, remembering or making decisions (Yes,No) |
highbp | binary | have you ever been told you have high blood pressure? (Yes, No) |
chd | binary | have you ever been diagnosed with coronary heart disease? (Yes, No) |
angina | binary | had ever been diagnosed as having angina, or angina pectoris (Yes, No) |
MI | binary | had ever been diagnosed as having a heart attack, or myocardial infarction (Yes, No) |
other_heart | binary | had ever been diagnosed with any other kind of heart disease or condition (Yes, No) |
high_chol | binary | had ever been diagnosed as having high cholesterol (Yes, No) |
arthritis | binary | had ever been diagnosed with arthritis (Yes, No) *this was removed b/c only 2 people in RA group had ‘no’ |
vars <- c('age' , 'sex' , 'race' , 'marital' , 'region' , 'income' , 'education' , 'public_ins' , 'prim_care' , 'flu_vac' , 'asthma' , 'diabetes' , 'cancer' , 'bronchitis' , 'emphysema' , 'stroke' , 'perc_health' , 'mental' , 'employment' , 'IADL' , 'ADL' , 'smoke' , 'phys_lim' , 'cog_dif' , 'highbp' ,'chd' , 'angina' , 'MI' , 'other_heart' , 'high_chol')
factorvars <- c('sex' , 'race' , 'marital' , 'region' , 'income' , 'education' , 'public_ins' , 'prim_care' , 'flu_vac' , 'asthma' , 'diabetes' , 'cancer' , 'bronchitis' , 'emphysema' , 'stroke' , 'perc_health' , 'mental' , 'employment' , 'IADL' , 'ADL' , 'smoke' , 'phys_lim' , 'cog_dif' , 'highbp' ,'chd' , 'angina' , 'MI' , 'other_heart' , 'high_chol')
trt <- c("RA_f")
table01 <- CreateTableOne(data = meps_clean,
vars = vars,
factorVars= factorvars,
strata = trt)
print(table01, verbose=TRUE)
Stratified by RA_f
No Yes p test
n 1986 516
age (mean (SD)) 52.34 (17.13) 61.50 (13.19) <0.001
sex = F (%) 1118 (56.3) 383 (74.2) <0.001
race (%) <0.001
White 1150 (57.9) 291 (56.4)
Hispanic 366 (18.4) 94 (18.2)
Black 290 (14.6) 108 (20.9)
Other 180 ( 9.1) 23 ( 4.5)
marital (%) <0.001
married 1061 (53.7) 256 (49.6)
never_married 428 (21.7) 51 ( 9.9)
divorced 267 (13.5) 97 (18.8)
widowed 158 ( 8.0) 82 (15.9)
separated 61 ( 3.1) 30 ( 5.8)
region (%) 0.006
South 763 (38.7) 241 (46.8)
West 485 (24.6) 108 (21.0)
Midwest 405 (20.6) 102 (19.8)
Northeast 317 (16.1) 64 (12.4)
income (%) <0.001
Poor 320 (16.1) 130 (25.2)
Near_poor 80 ( 4.0) 32 ( 6.2)
Low 309 (15.6) 83 (16.1)
Middle 542 (27.3) 127 (24.6)
High 735 (37.0) 144 (27.9)
education (%) <0.001
no_degree 243 (12.3) 116 (22.7)
GED 82 ( 4.2) 34 ( 6.7)
HS 816 (41.4) 226 (44.2)
BA 397 (20.1) 55 (10.8)
graduate 235 (11.9) 42 ( 8.2)
Other 200 (10.1) 38 ( 7.4)
public_ins = No (%) 1407 (70.8) 263 (51.0) <0.001
prim_care = No (%) 374 (19.4) 24 ( 4.7) <0.001
flu_vac = No (%) 456 (48.3) 100 (35.5) <0.001
asthma = No (%) 1702 (85.9) 409 (79.3) <0.001
diabetes = No (%) 1653 (83.4) 403 (78.1) 0.006
cancer = No (%) 1745 (88.1) 419 (81.2) <0.001
bronchitis = No (%) 1902 (97.1) 473 (92.0) <0.001
emphysema = No (%) 1926 (97.2) 460 (89.1) <0.001
stroke = No (%) 1870 (94.4) 458 (88.8) <0.001
perc_health (%) <0.001
VG_Exc 1003 (51.2) 88 (17.1)
Good 575 (29.4) 187 (36.4)
Fair 290 (14.8) 157 (30.5)
Poor 90 ( 4.6) 82 (16.0)
mental (%) <0.001
Excellent 636 (32.5) 109 (21.2)
VG 570 (29.1) 109 (21.2)
Good 547 (27.9) 173 (33.7)
Fair 161 ( 8.2) 100 (19.5)
Poor 44 ( 2.2) 23 ( 4.5)
employment = No (%) 822 (42.0) 360 (70.2) <0.001
IADL = No (%) 1858 (94.8) 426 (82.9) <0.001
ADL = No (%) 1902 (97.0) 457 (88.9) <0.001
smoke = yes (%) 285 (16.4) 113 (24.3) <0.001
phys_lim = no (%) 1587 (81.1) 221 (43.0) <0.001
cog_dif = no (%) 1802 (91.8) 425 (82.5) <0.001
highbp = No (%) 1115 (56.3) 191 (37.0) <0.001
chd = No (%) 1838 (92.9) 448 (87.0) <0.001
angina = No (%) 1921 (97.0) 485 (94.0) 0.002
MI = No (%) 1876 (94.7) 465 (90.1) <0.001
[ reached getOption("max.print") -- omitted 2 rows ]
Half of the sample is missing flu_vac
, so I will probably drop this as a covariate of interest.
Other variables we will be imputing are smoke, prim_car, phys_lim, employment, bronchitis, cog_dif, mental, per_health, IADL, ADL, region, education, marital, cancer, chd, angina, asthma, diabetes, emphysema, stroke, highbp, MI, other_heart, and high_chol
miss_var_summary(meps_clean)
# A tibble: 38 x 3
variable n_miss pct_miss
<chr> <int> <dbl>
1 flu_vac 1276 51.0
2 smoke 300 12.0
3 prim_care 70 2.80
4 employment 31 1.24
5 bronchitis 30 1.20
6 perc_health 30 1.20
7 mental 30 1.20
8 phys_lim 30 1.20
9 IADL 28 1.12
10 ADL 27 1.08
# … with 28 more rows
Here i am taking out flu vaccine
meps_no_flu <- meps_clean %>% select(-flu_vac)
I will let the mice
package, do all of the imputation for me, but instead of multiple imputation I will just do one. And then I will pull out that one simply imputed data set.
set.seed(432432)
meps_mice <- mice(meps_no_flu, m = 1, printFlag = FALSE)
Warning: Number of logged events: 3
I will now store the 1st imputed data set inmeps1718
meps1718 <- complete(meps_mice, 1) %>% tibble()
dim(meps1718)
[1] 2502 37
And I do not have any more missing!
n_miss(meps1718)
[1] 0
Ignoring covariates, I will estimate the effect of RA vs. no RA on the outcome, developing pneumonia.
Below is a 2 x 2 table which will give us the RR, OR, and risk difference.
Epi::twoby2(table(meps1718$RA_f, meps1718$pneumo_f)) %>% kable(dig=3)
2 by 2 table analysis:
------------------------------------------------------
Outcome : No
Comparing : No vs. Yes
No Yes P(No) 95% conf. interval
No 1962 24 0.9879 0.9820 0.9919
Yes 487 29 0.9438 0.9203 0.9607
95% conf. interval
Relative Risk: 1.0467 1.0244 1.0696
Sample Odds Ratio: 4.8681 2.8090 8.4366
Conditional MLE Odds Ratio: 4.8639 2.7070 8.8141
Probability difference: 0.0441 0.0263 0.0678
Exact P-value: 0.0000
Asymptotic P-value: 0.0000
------------------------------------------------------
|
|
|
Below is another method to obtain the OR, through logistic regression.
unadjust_binary_outcome <- glm(pneumo ~ RA, data = meps1718, family = binomial())
unadjust_binary_outcome_tidy <- tidy(unadjust_binary_outcome, conf.int = TRUE, conf.level = 0.95, exponentiate = TRUE) %>%
filter(term == "RA")
unadjust_binary_outcome_tidy %>% select(-p.value, -statistic) %>% kable(dig=3)
term | estimate | std.error | conf.low | conf.high |
---|---|---|---|---|
RA | 4.868 | 0.281 | 2.813 | 8.502 |
The odds of having pneumonia in RA individuals was 4.87 (95%CI: 2.81, 8.5) times higher than the odds that a non-RA control had pneumonia.
I will now fit the propensity score, which predicts RA status based on these 30 available covariates:
panel
, age
, sex
, race
, marital
, region
, income
, education
, public_ins
, prim_care
, asthma
, diabetes
, cancer
, bronchitis
, emphysema
, stroke
, perc_health
, mental
, employment
, IADL
, ADL
, smoke
, phys_lim
, cog_dif
, highbp
, chd
, angina
, MI
, other_heart
, high_chol
paste(colnames(meps1718), collapse = ", ")
[1] "dupersid, RA, RA_f, pneumo, pneumo_f, totexp, panel, age, sex, race, marital, region, income, education, public_ins, prim_care, asthma, diabetes, cancer, bronchitis, emphysema, stroke, perc_health, mental, employment, IADL, ADL, smoke, phys_lim, cog_dif, highbp, chd, angina, MI, other_heart, high_chol, arthritis"
We’ll use the f.build
tool from the cobalt
package here.
meps1718 <- meps1718 %>%
mutate(treat = as.logical(RA_f == "Yes"))
covs_1 <- meps1718 %>%
select(panel, age, sex, race, marital, region, income, education, public_ins, prim_care, asthma, diabetes, cancer, bronchitis, emphysema, stroke, perc_health, mental, employment, IADL, ADL, smoke, phys_lim, cog_dif, highbp, chd, angina, MI, other_heart, high_chol)
prop_model <- glm(f.build("treat", covs_1), data = meps1718,
family = binomial)
tidy(prop_model, conf.int = TRUE) %>%
select(term, estimate, std.error, conf.low, conf.high, p.value) %>%
knitr::kable(digits = 3)
term | estimate | std.error | conf.low | conf.high | p.value |
---|---|---|---|---|---|
(Intercept) | -3.054 | 0.711 | -4.454 | -1.664 | 0.000 |
panel22 | 0.003 | 0.140 | -0.270 | 0.278 | 0.983 |
panel23 | -0.144 | 0.162 | -0.463 | 0.173 | 0.374 |
age | 0.024 | 0.005 | 0.013 | 0.034 | 0.000 |
sexF | 0.844 | 0.132 | 0.589 | 1.105 | 0.000 |
raceHispanic | 0.097 | 0.176 | -0.250 | 0.440 | 0.579 |
raceBlack | 0.309 | 0.169 | -0.024 | 0.639 | 0.067 |
raceOther | -0.564 | 0.269 | -1.112 | -0.054 | 0.036 |
maritalnever_married | -0.653 | 0.205 | -1.063 | -0.257 | 0.001 |
maritaldivorced | -0.203 | 0.172 | -0.544 | 0.131 | 0.238 |
maritalwidowed | -0.388 | 0.194 | -0.772 | -0.010 | 0.046 |
maritalseparated | 0.114 | 0.287 | -0.459 | 0.670 | 0.692 |
regionWest | -0.083 | 0.158 | -0.394 | 0.225 | 0.598 |
regionMidwest | -0.165 | 0.158 | -0.476 | 0.142 | 0.295 |
regionNortheast | -0.201 | 0.181 | -0.561 | 0.150 | 0.267 |
incomeNear_poor | 0.088 | 0.272 | -0.453 | 0.617 | 0.746 |
incomeLow | -0.155 | 0.196 | -0.541 | 0.228 | 0.428 |
incomeMiddle | 0.065 | 0.190 | -0.307 | 0.439 | 0.732 |
incomeHigh | 0.283 | 0.208 | -0.124 | 0.693 | 0.174 |
educationGED | 0.142 | 0.279 | -0.411 | 0.684 | 0.609 |
educationHS | -0.189 | 0.170 | -0.521 | 0.145 | 0.265 |
educationBA | -0.606 | 0.230 | -1.060 | -0.159 | 0.008 |
educationgraduate | -0.335 | 0.254 | -0.837 | 0.158 | 0.187 |
educationOther | -0.563 | 0.254 | -1.069 | -0.071 | 0.027 |
public_insNo | -0.076 | 0.143 | -0.356 | 0.205 | 0.594 |
prim_careNo | -1.058 | 0.235 | -1.539 | -0.614 | 0.000 |
asthmaNo | -0.062 | 0.165 | -0.382 | 0.264 | 0.705 |
diabetesNo | 0.456 | 0.152 | 0.161 | 0.756 | 0.003 |
cancerNo | -0.140 | 0.160 | -0.451 | 0.176 | 0.382 |
bronchitisNo | -0.078 | 0.252 | -0.566 | 0.422 | 0.758 |
emphysemaNo | -0.385 | 0.246 | -0.869 | 0.098 | 0.118 |
strokeNo | 0.199 | 0.206 | -0.200 | 0.608 | 0.333 |
perc_healthGood | 1.105 | 0.164 | 0.787 | 1.429 | 0.000 |
perc_healthFair | 1.273 | 0.193 | 0.897 | 1.652 | 0.000 |
perc_healthPoor | 1.672 | 0.259 | 1.166 | 2.182 | 0.000 |
mentalVG | -0.193 | 0.168 | -0.523 | 0.135 | 0.249 |
mentalGood | -0.301 | 0.170 | -0.634 | 0.031 | 0.075 |
mentalFair | 0.034 | 0.223 | -0.405 | 0.471 | 0.878 |
mentalPoor | -0.398 | 0.351 | -1.097 | 0.283 | 0.257 |
employmentNo | 0.220 | 0.153 | -0.080 | 0.519 | 0.151 |
IADLNo | 0.019 | 0.234 | -0.440 | 0.480 | 0.936 |
ADLNo | -0.173 | 0.275 | -0.711 | 0.368 | 0.530 |
smokeyes | 0.438 | 0.153 | 0.136 | 0.737 | 0.004 |
phys_limno | -0.919 | 0.140 | -1.193 | -0.645 | 0.000 |
cog_difno | 0.186 | 0.188 | -0.180 | 0.559 | 0.322 |
highbpNo | -0.086 | 0.132 | -0.345 | 0.173 | 0.516 |
chdNo | 0.023 | 0.224 | -0.414 | 0.467 | 0.919 |
anginaNo | -0.046 | 0.293 | -0.614 | 0.536 | 0.876 |
MINo | -0.034 | 0.238 | -0.496 | 0.438 | 0.887 |
other_heartNo | 0.159 | 0.159 | -0.151 | 0.474 | 0.318 |
high_cholNo | 0.046 | 0.127 | -0.203 | 0.296 | 0.719 |
glance(prop_model)
# A tibble: 1 x 8
null.deviance df.null logLik AIC BIC deviance df.residual nobs
<dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
1 2547. 2501 -989. 2079. 2376. 1977. 2451 2502
meps1718 <- meps1718 %>%
mutate(ps = prop_model$fitted,
linps = prop_model$linear.predictors)
ggplot(meps1718, aes(x = RA_f, y = linps)) +
geom_violin() +
geom_boxplot(width = 0.3)
The density plot below shows us that we have a substantial number of RA patients who do not have overlapping PS scores with non-RA patients.
ggplot(meps1718, aes(x = linps, fill = RA_f)) +
geom_density(alpha = 0.3)
The range of PS for the non-RA group is 0.0018 to 0.8409. The range for the RA group is 0.02 to 0.85.
Having a minimum this low is worrisome.
meps1718%$%
mosaic::favstats(ps ~ RA_f) %>% kable(dig=4)
Registered S3 method overwritten by 'mosaic':
method from
fortify.SpatialPolygonsDataFrame ggplot2
RA_f | min | Q1 | median | Q3 | max | mean | sd | n | missing |
---|---|---|---|---|---|---|---|---|---|
No | 0.0018 | 0.0429 | 0.0958 | 0.2204 | 0.8409 | 0.1597 | 0.1645 | 1986 | 0 |
Yes | 0.0221 | 0.2174 | 0.3862 | 0.5411 | 0.8521 | 0.3853 | 0.2004 | 516 | 0 |
Hmisc::describe shows us that there are many observations with a PS score of 0.002
Hmisc::describe(~ ps, data = meps1718)
ps
1 Variables 2502 Observations
--------------------------------------------------------------------------------
ps
n missing distinct Info Mean Gmd .05 .10
2502 0 2501 1 0.2062 0.2078 0.01289 0.02254
.25 .50 .75 .90 .95
0.05296 0.13242 0.31044 0.51550 0.62385
lowest : 0.001779336 0.001957033 0.002080588 0.002114288 0.002691743
highest: 0.832155102 0.839330830 0.840857492 0.851275383 0.852068436
--------------------------------------------------------------------------------
match_1
1:1 Greedy matching on the linear PSThe first type of match I will conduct is greedy 1:1 matching, without replacement. As we had only 1986 controls, we will not match all of the 516 RA patients.
I am using the Matching
package
I am defining our treat
(treatment) as occurring when RA_f
is yes.
match_1 <- Match(Tr = meps1718$treat, X = meps1718$linps,
M = 1, replace = FALSE, ties = FALSE,
estimand = "ATT")
summary(match_1)
Estimate... 0
SE......... 0
T-stat..... NaN
p.val...... NA
Original number of observations.............. 2502
Original number of treated obs............... 516
Matched number of observations............... 516
Matched number of observations (unweighted). 516
I am creating a new data frame, match1_matches
, for upcoming analyses. This will contain only the matched subjects.
match1_matches <- factor(rep(match_1$index.treated, 2))
meps1718_matched1 <- cbind(match1_matches,
meps1718[c(match_1$index.control,
match_1$index.treated),])
Some sanity checks:
meps1718_matched1 %>% count(RA_f)
RA_f n
1 No 516
2 Yes 516
meps1718_matched1 %>% head()
match1_matches dupersid RA RA_f pneumo pneumo_f totexp panel age sex race
1 1987 94379104 0 No 0 No 21156 22 85 F Other
2 1988 91142101 0 No 0 No 2400 22 71 F White
3 1989 2293255101 0 No 0 No 19626 22 70 F White
4 1990 90530101 0 No 0 No 1151 22 65 F White
5 1991 95795102 0 No 0 No 409 22 33 F White
6 1992 2295001101 0 No 0 No 6238 22 66 M White
marital region income education public_ins prim_care asthma diabetes
1 widowed South Near_poor no_degree Yes Yes No No
2 married South High HS Yes Yes No No
3 divorced Midwest Near_poor HS Yes Yes No No
4 divorced Midwest High HS No Yes No No
5 married South High BA No Yes No No
6 married South Middle HS Yes Yes No Yes
cancer bronchitis emphysema stroke perc_health mental employment IADL ADL
1 No No No No Good Good No Yes Yes
2 No No No No Fair Fair Yes No No
3 Yes Yes Yes No VG_Exc VG No Yes No
4 No No No No Good VG Yes No No
5 No No No No VG_Exc Excellent Yes No No
6 Yes No No No Poor Good No No No
smoke phys_lim cog_dif highbp chd angina MI other_heart high_chol arthritis
1 no yes no Yes No No No No No No
2 no no no No No No No No No No
3 no yes no No No No No No No Yes
4 no no no Yes No No No No No No
5 no no no No No No No Yes No Yes
6 no yes no Yes No No No Yes Yes Yes
treat ps linps
1 FALSE 0.57371174 0.29701130
2 FALSE 0.51210454 0.04842762
3 FALSE 0.42143501 -0.31688530
4 FALSE 0.29988929 -0.84782511
5 FALSE 0.05658893 -2.81368871
6 FALSE 0.40948396 -0.36609908
Looks good. I built a new matched sample data frame with only the matched subjects
bal.tab
to obtain a balance tablepaste(colnames(meps1718), collapse = ", ")
[1] "dupersid, RA, RA_f, pneumo, pneumo_f, totexp, panel, age, sex, race, marital, region, income, education, public_ins, prim_care, asthma, diabetes, cancer, bronchitis, emphysema, stroke, perc_health, mental, employment, IADL, ADL, smoke, phys_lim, cog_dif, highbp, chd, angina, MI, other_heart, high_chol, arthritis, treat, ps, linps"
covs_1plus <- meps1718 %>%
select(panel, age, sex, race, marital, region, income, education, public_ins, prim_care, asthma, diabetes, cancer, bronchitis, emphysema, stroke, perc_health, mental, employment, IADL, ADL, smoke, phys_lim, cog_dif, highbp, chd, angina, MI, other_heart, high_chol, ps, linps)
bal1 <- bal.tab(match_1,
treat = meps1718$RA,
covs = covs_1plus, quick = FALSE,
data = meps1718, stats = c("m", "v"),
un = TRUE, disp.v.ratio = TRUE)
bal1
Balance Measures
Type Diff.Un V.Ratio.Un Diff.Adj V.Ratio.Adj
panel_21 Binary -0.0044 . -0.0097 .
panel_22 Binary 0.0070 . 0.0136 .
panel_23 Binary -0.0026 . -0.0039 .
age Contin. 0.6945 0.5928 -0.0993 0.8479
sex_F Binary 0.1793 . 0.0252 .
race_White Binary -0.0151 . 0.0058 .
race_Hispanic Binary -0.0021 . -0.0058 .
race_Black Binary 0.0633 . -0.0019 .
race_Other Binary -0.0461 . 0.0019 .
marital_married Binary -0.0406 . -0.0155 .
marital_never_married Binary -0.1182 . 0.0213 .
marital_divorced Binary 0.0535 . 0.0097 .
marital_widowed Binary 0.0778 . -0.0213 .
marital_separated Binary 0.0274 . 0.0058 .
region_South Binary 0.0798 . 0.0194 .
region_West Binary -0.0359 . -0.0136 .
region_Midwest Binary -0.0073 . 0.0097 .
region_Northeast Binary -0.0366 . -0.0155 .
income_Poor Binary 0.0908 . 0.0213 .
income_Near_poor Binary 0.0217 . -0.0116 .
income_Low Binary 0.0053 . 0.0116 .
income_Middle Binary -0.0268 . 0.0019 .
income_High Binary -0.0910 . -0.0233 .
education_no_degree Binary 0.1034 . -0.0019 .
education_GED Binary 0.0265 . 0.0136 .
education_HS Binary 0.0290 . 0.0039 .
education_BA Binary -0.0953 . 0.0019 .
education_graduate Binary -0.0360 . -0.0174 .
education_Other Binary -0.0276 . 0.0000 .
public_ins_No Binary -0.1988 . -0.0155 .
prim_care_No Binary -0.1459 . 0.0058 .
asthma_No Binary -0.0664 . -0.0388 .
diabetes_No Binary -0.0528 . -0.0039 .
cancer_No Binary -0.0686 . 0.0136 .
bronchitis_No Binary -0.0503 . -0.0039 .
emphysema_No Binary -0.0808 . -0.0252 .
stroke_No Binary -0.0560 . 0.0097 .
perc_health_VG_Exc Binary -0.3410 . -0.0155 .
perc_health_Good Binary 0.0668 . -0.0271 .
perc_health_Fair Binary 0.1597 . 0.0116 .
perc_health_Poor Binary 0.1145 . 0.0310 .
mental_Excellent Binary -0.1120 . 0.0078 .
mental_VG Binary -0.0789 . -0.0155 .
mental_Good Binary 0.0563 . -0.0213 .
mental_Fair Binary 0.1112 . 0.0329 .
mental_Poor Binary 0.0233 . -0.0039 .
employment_No Binary 0.2792 . -0.0136 .
IADL_No Binary -0.1215 . -0.0271 .
ADL_No Binary -0.0817 . -0.0233 .
smoke_yes Binary 0.0780 . 0.0291 .
[ reached 'max' / getOption("max.print") -- omitted 10 rows ]
Sample sizes
Control Treated
All 1986 516
Matched 516 516
Unmatched 1470 0
We’ll build a little table of the Rubin’s Rules (1 and 2) before and after our match_1
is applied.
covs_for_rubin <- meps1718 %>%
select(linps)
rubin_m1 <- bal.tab(match_1,
treat = meps1718$treat,
covs = covs_for_rubin,
data = meps1718, stats = c("m", "v"),
un = TRUE, disp.v.ratio = TRUE)[1]
rubin_report_m1 <- tibble(
status = c("Rule1", "Rule2"),
Unmatched = c(rubin_m1$Balance$Diff.Un,
rubin_m1$Balance$V.Ratio.Un),
Matched = c(rubin_m1$Balance$Diff.Adj,
rubin_m1$Balance$V.Ratio.Adj))
rubin_report_m1 %>% knitr::kable(digits = 2)
status | Unmatched | Matched |
---|---|---|
Rule1 | 1.55 | 0.10 |
Rule2 | 0.57 | 1.16 |
Explanation of Rubin’s Rules
Rule 1:
Evaluates the standardized differences of the linear PS expressed as proportions
Ideally, these results should be as close to 0 as possible, and definitely < 0.5 in absolute value (or 50%)
Rule 2
Evaluates the variance ratio of the linear propensity scores.
Ideally, these results should be within 0.8 to 1.25, but defiantly within 0.5 to 2
My Results
Rule 1: before matching we have a bias of 155.1308952%, and this is reduced to 9.5955115% after 1:1 greedy matching.
Rule 2: before matching we have a variance ratio of 56.6665221%, and this becomes 115.6520243% after 1:1 greedy matching.
We can see a considerable improvement, but I think we can do better
bal.plot
from cobalt
Age was my only quantitative variable. And we will see in all of the analyses that this was one of the most difficult to balance between groups. Here we can see the distributional balance for age before and after matching.
bal.plot(match_1,
treat = meps1718$RA,
covs = covs_1plus,
var.name = "age",
which = "both",
sample.names =
c("Unmatched Sample", "Matched Sample"))
We can graphically compare the distribution of PS in both groups with mirrored histograms, before and after matching
bal.plot(match_1,
treat = meps1718$RA,
covs = covs_1plus,
var.name = "ps",
which = "both",
sample.names =
c("Unmatched Sample", "Matched Sample"),
type = "histogram", mirror = TRUE)
After matching, the mirrored histogram looks the same! So cool!!
I am also evaluating some of the categorical variables that I am interested in based on previous some previous substantial imbalances
bal.plot(match_1,
treat = meps1718$RA,
covs = covs_1plus,
var.name = "phys_lim",
which = "both",
sample.names =
c("Unmatched Sample", "Matched Sample"))
bal.plot(match_1,
treat = meps1718$RA,
covs = covs_1plus,
var.name = "perc_health",
which = "both",
sample.names =
c("Unmatched Sample", "Matched Sample"))
It makes sense that perceived health and physical limitations would be so imbalanced before matching, given that people with RA have significantly reduced mobility. So that’s really awesome that we were able to correct this because these both are greatly associated with poor health outcomes.
love.plot
to look at Standardized DifferencesThe love plots below show the standardized differences before and matching for each covariate and the linear PS.
Although only one is necessary as they both describe the same thing, I like that I have the option of two different ways of visualizing the same message. The first love plot provides the cut offs for -10% to 10%, which the second one shows the absolute difference.
love.plot(bal1,
threshold = .1, size = 3,
var.order = "unadjusted",
stats = "mean.diffs",
stars = "raw",
sample.names = c("Unmatched", "Matched"),
title = "Love Plot for 1:1 Greedy Match") +
labs(caption = "* indicates raw mean differences (for binary variables)")
love.plot(bal1,
threshold = .1, size = 3,
var.order = "unadjusted",
stats = "mean.diffs",
stars = "raw",
abs = TRUE,
sample.names = c("Unmatched", "Matched"),
title = "Absolute Differences for 1:1 Greedy Match") +
labs(caption = "* indicates raw mean differences (for binary variables)")
The balance after matching is incredible. All of the covariates are within our target range. But as we saw before with Rubin’s Rule #1, that we just barely made it within the desired range for linear PS. I think we can do better. So I will try matching with a caliper next.
love.plot
to look at Variance RatiosHere we can see the variance ratios (Rubin Rule #2) before and after matching for the linear PS. we can also see it for my only quantitative variable, age (categorical variables are dropped). We were not within the desired range of 4/5 to 5/4 before and after matching for age (that was a tough variable), but we are for linps (Rubin #2)
love.plot(bal1,
threshold = 4/5, size = 3,
stats = "variance.ratios",
sample.names = c("Unmatched", "Matched"),
title = "Variance Ratios for our 1:1 Match")
adj.m1.pneumo <- clogit(pneumo ~ RA + strata(match1_matches), data=meps1718_matched1)
adj.m1_tidy <- tidy(adj.m1.pneumo, exponentiate = TRUE,
conf.int = TRUE)
adj.m1_tidy %>% kable(digits=2)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
RA | 2.64 | 0.35 | 2.74 | 0.01 | 1.32 | 5.28 |
The odds of having pneumonia in RA individuals was 2.64 (95%CI: 1.32, 5.28) times higher than the odds that a non-RA control had pneumonia.
match_4
Caliper Matching (1:1 without replacement) with the Matching
packageI will use the Match
function to specify a caliper of 0.2 (from Matching
package)
Here, subjects will only be matched if they fall within the maximum distance of 0.2 standard deviations of the logit of the propensity score.
If they do not fall within this range, those subjects will be dropped.
The matching design will be 1:1 without replacement
match_4 <- Match(Tr = meps1718$treat, X = meps1718$linps,
M = 1, replace = FALSE, ties = FALSE,
caliper = 0.2, estimand = "ATT")
summary(match_4)
Estimate... 0
SE......... 0
T-stat..... NaN
p.val...... NA
Original number of observations.............. 2502
Original number of treated obs............... 516
Matched number of observations............... 478
Matched number of observations (unweighted). 478
Caliper (SDs)........................................ 0.2
Number of obs dropped by 'exact' or 'caliper' 38
Note that we have now dropped 38 of the RA=yes subjects, and reduced our sample to the 478 remaining RA=yes subjects, who are paired with 478 unique matched RA = no subjects in our matched sample.
Here I am storing the matched sample
match4_matches <- factor(rep(match_4$index.treated, 2))
meps1718_matched4 <- cbind(match4_matches,
meps1718[c(match_4$index.control,
match_4$index.treated),])
How many unique subjects are in our matched sample?
meps1718_matched4 %$% n_distinct(dupersid)
[1] 956
This match includes 478 pairs so 956 subjects, since we’ve done matching without replacement.
meps1718_matched4 %>% count(RA_f)
RA_f n
1 No 478
2 Yes 478
bal.tab
to obtain a balance tablecovs_4plus <- meps1718 %>%
select(panel, age, sex, race, marital, region, income, education, public_ins, prim_care, asthma, diabetes, cancer, bronchitis, emphysema, stroke, perc_health, mental, employment, IADL, ADL, smoke, phys_lim, cog_dif, highbp, chd, angina, MI, other_heart, high_chol, ps, linps)
bal4 <- bal.tab(match_4,
treat = meps1718$RA_f,
covs = covs_4plus, quick = FALSE,
data = meps1718, stats = c("m", "v"),
un = TRUE, disp.v.ratio = TRUE)
bal4
Balance Measures
Type Diff.Un V.Ratio.Un Diff.Adj V.Ratio.Adj
panel_21 Binary -0.0044 . 0.0126 .
panel_22 Binary 0.0070 . -0.0335 .
panel_23 Binary -0.0026 . 0.0209 .
age Contin. 0.6945 0.5928 -0.1337 0.8203
sex_F Binary 0.1793 . 0.0063 .
race_White Binary -0.0151 . -0.0439 .
race_Hispanic Binary -0.0021 . 0.0084 .
race_Black Binary 0.0633 . 0.0293 .
race_Other Binary -0.0461 . 0.0063 .
marital_married Binary -0.0406 . -0.0251 .
marital_never_married Binary -0.1182 . 0.0377 .
marital_divorced Binary 0.0535 . 0.0000 .
marital_widowed Binary 0.0778 . -0.0167 .
marital_separated Binary 0.0274 . 0.0042 .
region_South Binary 0.0798 . 0.0209 .
region_West Binary -0.0359 . -0.0063 .
region_Midwest Binary -0.0073 . -0.0084 .
region_Northeast Binary -0.0366 . -0.0063 .
income_Poor Binary 0.0908 . 0.0230 .
income_Near_poor Binary 0.0217 . -0.0042 .
income_Low Binary 0.0053 . -0.0146 .
income_Middle Binary -0.0268 . 0.0167 .
income_High Binary -0.0910 . -0.0209 .
education_no_degree Binary 0.1034 . 0.0021 .
education_GED Binary 0.0265 . 0.0063 .
education_HS Binary 0.0290 . -0.0021 .
education_BA Binary -0.0953 . -0.0209 .
education_graduate Binary -0.0360 . 0.0063 .
education_Other Binary -0.0276 . 0.0084 .
public_ins_No Binary -0.1988 . -0.0105 .
prim_care_No Binary -0.1459 . 0.0188 .
asthma_No Binary -0.0664 . -0.0230 .
diabetes_No Binary -0.0528 . -0.0063 .
cancer_No Binary -0.0686 . 0.0251 .
bronchitis_No Binary -0.0503 . 0.0000 .
emphysema_No Binary -0.0808 . -0.0230 .
stroke_No Binary -0.0560 . -0.0021 .
perc_health_VG_Exc Binary -0.3410 . -0.0146 .
perc_health_Good Binary 0.0668 . -0.0021 .
perc_health_Fair Binary 0.1597 . 0.0105 .
perc_health_Poor Binary 0.1145 . 0.0063 .
mental_Excellent Binary -0.1120 . -0.0167 .
mental_VG Binary -0.0789 . 0.0146 .
mental_Good Binary 0.0563 . -0.0084 .
mental_Fair Binary 0.1112 . 0.0146 .
mental_Poor Binary 0.0233 . -0.0042 .
employment_No Binary 0.2792 . -0.0293 .
IADL_No Binary -0.1215 . -0.0042 .
ADL_No Binary -0.0817 . -0.0063 .
smoke_yes Binary 0.0780 . 0.0418 .
[ reached 'max' / getOption("max.print") -- omitted 10 rows ]
Sample sizes
No Yes
All 1986 516
Matched 478 478
Unmatched 1508 0
Discarded 0 38
Below is a table of Rubin’s Rules 1 & 2 before and after our 1:1 caliper match is applied
covs_for_rubin <- meps1718 %>%
select(linps)
rubin_m4 <- bal.tab(match_4,
treat = meps1718$treat,
covs = covs_for_rubin,
data = meps1718, stats = c("m", "v"),
un = TRUE, disp.v.ratio = TRUE)[1]
rubin_report_m4 <- tibble(
status = c("Rule1", "Rule2"),
Unmatched = c(rubin_m4$Balance$Diff.Un,
rubin_m4$Balance$V.Ratio.Un),
Matched = c(rubin_m4$Balance$Diff.Adj,
rubin_m4$Balance$V.Ratio.Adj))
rubin_report_m4 %>% knitr::kable(digits = 3)
status | Unmatched | Matched |
---|---|---|
Rule1 | 1.551 | 0.004 |
Rule2 | 0.567 | 1.010 |
In summary, we can see that caliper matching produced an exceptionally strong match. Rubin’s Rule 1 is 0 and Rule 2 is essentially 1. However, this cost us 38 patients in the RA group who were the “hardest to match”, which is quite sad.
bal.plot
from cobalt
We can graphically compare the distribution of PS in both groups with mirrored histograms, before and after matching
bal.plot(match_4,
treat = meps1718$RA_f,
covs = covs_4plus,
var.name = "ps",
which = "both",
sample.names =
c("Unmatched Sample", "match_4 Sample"),
type = "histogram", mirror = TRUE)
After matching, the mirrored histogram looks the same! So cool!!
love.plot
to look at Standardized DifferencesThe love plot below helps us to look at the balance of each of the covariates.
We can see that we got all of the covariates in the desired range of -10% to +10%, except for age
. However its a whole lot better than it was before.
love.plot(bal4,
threshold = .1, size = 3,
var.order = "unadjusted",
stats = "mean.diffs",
stars = "raw",
sample.names = c("Unmatched", "Matched"),
title = "Love Plot for 1:1 Caliper Match (478 unique pairs)") +
labs(caption = "* indicates raw mean differences (for binary variables)")
love.plot
to look at Variance RatiosWe only had one continuous variable, age. We can see that the variance ratio of age. We can see that its even closer to 1 after matching.
Most importantly, we can visually see that we have met Rubin’s Rule #2 because the variance of the linear PS=1
love.plot(bal4,
threshold = .5, size = 3,
stats = "variance.ratios",
sample.names = c("Unmatched", "Matched"),
title = "Variance Ratios for our 1:1 Caliper Match")
From the matched sample, match4_matches
, I will perform a conditional logistic regression to estimate the exposure effect in terms of an odds ratio (by exponentiating the log odds ratio). NOTE: I am using the 0/1 version of the outcome (pneumo
) and exposure (RA
) indicator.
I will be using the function, clogit
, from the survival
package.
adj.m.pneumo <- clogit(pneumo ~ RA + strata(match4_matches), data=meps1718_matched4)
adj.m_tidy <- tidy(adj.m.pneumo, exponentiate = TRUE,
conf.int = TRUE)
adj.m_tidy %>% kable(digits=2)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
RA | 1.85 | 0.34 | 1.78 | 0.08 | 0.94 | 3.63 |
The odds of having pneumonia in RA individuals was 1.85 (95%CI: 0.94, 3.63) times higher than the odds that a non-RA control had pneumonia.
I will be weighting by the inverse of the propensity score using the ATT approach.
meps1718$wts1 <- ifelse(meps1718$RA==1, 1, meps1718$ps/(1-meps1718$ps))
Here is a plot of the resulting ATT (average treatment effect on the treated) weights:
ggplot(meps1718, aes(x = ps, y = wts1, color = RA_f)) +
geom_point() +
guides(color = FALSE) +
facet_wrap(~ RA_f) +
labs(x = "Estimated Propensity for RA",
y = "ATT weights for the MEPS data",
title = "ATT weighting structure")
The exposed (RA) patients (right) all get a weight of 1, while the control patients, the higher their PS score is, the higher their weight. When controls have a very low PS for having RA, they have low weights.
I will use the twang package to assess balance after weighting. The twang package will calculate the effects on the weights.
The balanced table below shows me the RA mean (treated mean ) and control mean before and after weighting and a standardized difference, which will be very useful.
meps1718_df <- base::data.frame(meps1718)
covlist <- c('panel', 'age', 'sex', 'race', 'marital', 'region', 'income', 'education', 'public_ins', 'prim_care', 'asthma', 'diabetes', 'cancer', 'bronchitis', 'emphysema', 'stroke', 'perc_health', 'mental', 'employment', 'IADL', 'ADL', 'smoke', 'phys_lim', 'cog_dif', 'highbp', 'chd', 'angina', 'MI', 'other_heart', 'high_chol', 'ps', 'linps')
bal.wts1 <- dx.wts(x=meps1718_df$wts1, data=meps1718_df, vars=covlist, treat.var= 'RA', estimand="ATT")
bal.table(bal.wts1)
$unw
tx.mn tx.sd ct.mn ct.sd std.eff.sz stat p ks
panel:21 0.252 0.434 0.256 0.437 -0.010 0.041 0.960 0.004
panel:22 0.502 0.500 0.495 0.500 0.014 NA NA 0.007
panel:23 0.246 0.431 0.249 0.432 -0.006 NA NA 0.003
age 61.504 13.191 52.343 17.132 0.695 13.162 0.000 0.281
sex:M 0.258 0.437 0.437 0.496 -0.410 54.843 0.000 0.179
sex:F 0.742 0.437 0.563 0.496 0.410 NA NA 0.179
race:White 0.564 0.496 0.579 0.494 -0.030 7.061 0.000 0.015
race:Hispanic 0.182 0.386 0.184 0.388 -0.005 NA NA 0.002
race:Black 0.209 0.407 0.146 0.353 0.156 NA NA 0.063
race:Other 0.045 0.206 0.091 0.287 -0.223 NA NA 0.046
marital:married 0.496 0.500 0.537 0.499 -0.081 18.261 0.000 0.041
marital:never_married 0.099 0.298 0.217 0.412 -0.396 NA NA 0.118
marital:divorced 0.188 0.391 0.134 0.341 0.137 NA NA 0.054
marital:widowed 0.159 0.366 0.081 0.273 0.213 NA NA 0.078
marital:separated 0.058 0.234 0.031 0.173 0.117 NA NA 0.027
region:South 0.469 0.499 0.389 0.488 0.160 4.110 0.006 0.080
region:West 0.209 0.407 0.245 0.430 -0.088 NA NA 0.036
region:Midwest 0.198 0.398 0.205 0.404 -0.018 NA NA 0.007
region:Northeast 0.124 0.330 0.161 0.367 -0.111 NA NA 0.037
income:Poor 0.252 0.434 0.161 0.368 0.209 8.480 0.000 0.091
income:Near_poor 0.062 0.241 0.040 0.197 0.090 NA NA 0.022
income:Low 0.161 0.367 0.156 0.362 0.014 NA NA 0.005
income:Middle 0.246 0.431 0.273 0.445 -0.062 NA NA 0.027
income:High 0.279 0.449 0.370 0.483 -0.203 NA NA 0.091
education:no_degree 0.227 0.419 0.123 0.329 0.247 13.128 0.000 0.103
education:GED 0.068 0.251 0.041 0.199 0.106 NA NA 0.027
education:HS 0.442 0.497 0.413 0.492 0.058 NA NA 0.029
ks.pval
panel:21 0.960
panel:22 0.960
panel:23 0.960
age 0.000
sex:M 0.000
sex:F 0.000
race:White 0.000
race:Hispanic 0.000
race:Black 0.000
race:Other 0.000
marital:married 0.000
marital:never_married 0.000
marital:divorced 0.000
marital:widowed 0.000
marital:separated 0.000
region:South 0.006
region:West 0.006
region:Midwest 0.006
region:Northeast 0.006
income:Poor 0.000
income:Near_poor 0.000
income:Low 0.000
income:Middle 0.000
income:High 0.000
education:no_degree 0.000
education:GED 0.000
education:HS 0.000
[ reached 'max' / getOption("max.print") -- omitted 54 rows ]
[[2]]
tx.mn tx.sd ct.mn ct.sd std.eff.sz stat p ks
panel:21 0.252 0.434 0.247 0.431 0.011 0.020 0.980 0.005
panel:22 0.502 0.500 0.503 0.500 -0.002 NA NA 0.001
panel:23 0.246 0.431 0.250 0.433 -0.009 NA NA 0.004
age 61.504 13.191 62.825 14.427 -0.100 -1.619 0.106 0.090
sex:M 0.258 0.437 0.250 0.433 0.017 0.091 0.763 0.008
sex:F 0.742 0.437 0.750 0.433 -0.017 NA NA 0.008
race:White 0.564 0.496 0.575 0.494 -0.022 0.246 0.862 0.011
race:Hispanic 0.182 0.386 0.163 0.369 0.050 NA NA 0.019
race:Black 0.209 0.407 0.218 0.413 -0.022 NA NA 0.009
race:Other 0.045 0.206 0.044 0.205 0.003 NA NA 0.001
marital:married 0.496 0.500 0.481 0.500 0.031 0.312 0.864 0.016
marital:never_married 0.099 0.298 0.088 0.284 0.035 NA NA 0.010
marital:divorced 0.188 0.391 0.192 0.394 -0.010 NA NA 0.004
marital:widowed 0.159 0.366 0.183 0.387 -0.066 NA NA 0.024
marital:separated 0.058 0.234 0.056 0.230 0.009 NA NA 0.002
region:South 0.469 0.499 0.482 0.500 -0.026 0.107 0.955 0.013
region:West 0.209 0.407 0.212 0.409 -0.006 NA NA 0.003
region:Midwest 0.198 0.398 0.187 0.390 0.027 NA NA 0.011
region:Northeast 0.124 0.330 0.119 0.324 0.015 NA NA 0.005
income:Poor 0.252 0.434 0.250 0.433 0.004 0.193 0.940 0.002
income:Near_poor 0.062 0.241 0.072 0.259 -0.042 NA NA 0.010
income:Low 0.161 0.367 0.159 0.366 0.004 NA NA 0.002
income:Middle 0.246 0.431 0.257 0.437 -0.025 NA NA 0.011
income:High 0.279 0.449 0.261 0.439 0.039 NA NA 0.018
education:no_degree 0.227 0.419 0.228 0.419 -0.003 0.125 0.984 0.001
education:GED 0.068 0.251 0.077 0.266 -0.035 NA NA 0.009
education:HS 0.442 0.497 0.449 0.497 -0.015 NA NA 0.007
ks.pval
panel:21 0.980
panel:22 0.980
panel:23 0.980
age 0.030
sex:M 0.763
sex:F 0.763
race:White 0.862
race:Hispanic 0.862
race:Black 0.862
race:Other 0.862
marital:married 0.864
marital:never_married 0.864
marital:divorced 0.864
marital:widowed 0.864
marital:separated 0.864
region:South 0.955
region:West 0.955
region:Midwest 0.955
region:Northeast 0.955
income:Poor 0.940
income:Near_poor 0.940
income:Low 0.940
income:Middle 0.940
income:High 0.940
education:no_degree 0.984
education:GED 0.984
education:HS 0.984
[ reached 'max' / getOption("max.print") -- omitted 54 rows ]
The std.eff.sz
shows the standardized difference, but as a proportion, rather than as a percentage. Therefore, I will multiply them by 100 & plot the results.
bal.before.wts1 <- bal.table(bal.wts1)[1]
bal.after.wts1 <- bal.table(bal.wts1)[2]
balance.att.weights <- base::data.frame(names = rownames(bal.before.wts1$unw),
pre.weighting = 100*bal.before.wts1$unw$std.eff.sz,
ATT.weighted = 100*bal.after.wts1[[1]]$std.eff.sz)
balance.att.weights <- gather(balance.att.weights, timing, szd, 2:3)
Below is a plot of standardized differences before and after ATT weighting
ggplot(balance.att.weights, aes(x = szd, y = reorder(names, szd), color = timing)) +
geom_point(size = 3) +
geom_vline(xintercept = 0) +
geom_vline(xintercept = c(-10,10), linetype = "dashed", col = "blue") +
labs(x = "Standardized Difference", y = "",
title = "Standardized Difference before and after ATT Weighting",
subtitle = "N=2502")
This plot indicates that the weighting is doing a nice job. The red dots are all within the -10% to +10% range.
balance.att.weights %>% filter(names == "linps")
names timing szd
1 linps pre.weighting 155.1
2 linps ATT.weighted -11.5
The standardized difference before weighting was 155.1%. After weighting, the standardized difference was -11.5%. This is not within the -10% to 10% range, so we would not pass Rule 1 with flying colors.
meps1718_df <- base::data.frame(meps1718)
covlist_ps <- c('linps')
bal.wts2 <- dx.wts(x=meps1718_df$wts1, data=meps1718_df, vars=covlist_ps, treat.var= 'RA', estimand="ATT")
bal.table(bal.wts2)
$unw
tx.mn tx.sd ct.mn ct.sd std.eff.sz stat p ks ks.pval
linps -0.592 1.037 -2.2 1.377 1.551 29.195 0 0.517 0
[[2]]
tx.mn tx.sd ct.mn ct.sd std.eff.sz stat p ks ks.pval
linps -0.592 1.037 -0.473 1.186 -0.115 -1.631 0.103 0.11 0.004
treated sd: (1.037)^2=1.075369 control sd: (1.186)^2=1.406596
1.186*1.186
[1] 1.406596
((1.037)^2)/((1.186)^2)
[1] 0.7645187
The standard deviations of the linear PS after weighting of 1.037 in the RA group and 1.186 in the non-RA group. 1.037^2 / 1.186 ^2 = 0.7645, which is just outside our desired range of 4/5 to 5/4, however it is within the 1/2 to 2. Arguably, we can pass Rule 2.
# Recall that twang does not play well with tibbles,
ps.meps1718 <- ps(RA ~ panel + age + sex + race + marital + region + income + education + public_ins + prim_care + asthma + diabetes + cancer + bronchitis + emphysema + stroke + perc_health + mental + employment + IADL + ADL + smoke + phys_lim + cog_dif + highbp + chd + angina + MI + other_heart + high_chol,
data = meps1718_df,
n.trees = 3000,
interaction.depth = 2,
stop.method = c("es.mean"),
estimand = "ATT",
verbose = FALSE)
cobalt
bal.tab(ps.meps1718, full.stop.method = "es.mean.att")
Call
ps(formula = RA ~ panel + age + sex + race + marital + region +
income + education + public_ins + prim_care + asthma + diabetes +
cancer + bronchitis + emphysema + stroke + perc_health +
mental + employment + IADL + ADL + smoke + phys_lim + cog_dif +
highbp + chd + angina + MI + other_heart + high_chol, data = meps1718_df,
n.trees = 3000, interaction.depth = 2, verbose = FALSE, estimand = "ATT",
stop.method = c("es.mean"))
Balance Measures
Type Diff.Adj
prop.score Distance 0.2787
panel_21 Binary -0.0040
panel_22 Binary 0.0056
panel_23 Binary -0.0015
age Contin. 0.0084
sex_F Binary 0.0110
race_White Binary -0.0178
race_Hispanic Binary 0.0072
race_Black Binary 0.0147
race_Other Binary -0.0041
marital_married Binary 0.0156
marital_never_married Binary -0.0008
marital_divorced Binary 0.0002
marital_widowed Binary -0.0188
marital_separated Binary 0.0039
region_South Binary 0.0153
region_West Binary -0.0151
region_Midwest Binary 0.0003
region_Northeast Binary -0.0005
income_Poor Binary 0.0046
income_Near_poor Binary -0.0060
income_Low Binary -0.0111
income_Middle Binary -0.0029
income_High Binary 0.0155
education_no_degree Binary 0.0135
education_GED Binary 0.0011
education_HS Binary -0.0058
education_BA Binary -0.0069
education_graduate Binary 0.0001
education_Other Binary -0.0020
public_ins_No Binary -0.0194
prim_care_No Binary -0.0055
asthma_No Binary -0.0011
diabetes_No Binary 0.0010
cancer_No Binary 0.0011
bronchitis_No Binary -0.0008
emphysema_No Binary -0.0023
stroke_No Binary 0.0139
perc_health_VG_Exc Binary -0.0198
perc_health_Good Binary 0.0080
perc_health_Fair Binary 0.0037
perc_health_Poor Binary 0.0081
mental_Excellent Binary 0.0195
mental_VG Binary -0.0071
mental_Good Binary -0.0134
mental_Fair Binary 0.0084
mental_Poor Binary -0.0074
employment_No Binary 0.0173
IADL_No Binary 0.0009
ADL_No Binary -0.0065
smoke_yes Binary 0.0142
phys_lim_no Binary -0.0101
cog_dif_no Binary 0.0176
highbp_No Binary -0.0091
chd_No Binary -0.0073
angina_No Binary -0.0067
MI_No Binary -0.0157
other_heart_No Binary 0.0020
high_chol_No Binary 0.0124
Effective sample sizes
Control Treated
Unadjusted 1986. 516
Adjusted 640.52 516
We can see that the propensity score has improved, but not all the way down to where we would like it to be. (Rubin 1)
p <- love.plot(bal.tab(ps.meps1718),
threshold = .1, size = 3,
title = "Standardized Diffs and TWANG ATT weighting")
Warning: Standardized mean differences and raw mean differences are present in the same plot.
Use the 'stars' argument to distinguish between them and appropriately label the x-axis.
p + theme_bw()
We can see below that we have a variance ratio of the lin PS in the range we would like it to be.
p <- love.plot(bal.tab(ps.meps1718), stat = "v",
threshold = 1.25, size = 3,
title = "Variance Ratios: TWANG ATT weighting")
p + theme_bw()
I am using logistic regression with the weighed sample to get the adjusted odds of the outcome, pneumonia.
meps1718wt1.design <- svydesign(ids=~1, weights=~wts1, data=meps1718) # using ATT weights
adjpneumo.wt1 <- svyglm(pneumo ~ RA, design=meps1718wt1.design, family=quasibinomial())
wt_att_results <- tidy(adjpneumo.wt1, conf.int = TRUE, exponentiate = TRUE) %>%
filter(term == "RA") %>% select(-statistic, - p.value)
wt_att_results %>% kable(digits=3)
term | estimate | std.error | conf.low | conf.high |
---|---|---|---|---|
RA | 2.601 | 0.371 | 1.256 | 5.386 |
The odds of having pneumonia in RA individuals was 2.6 (95%CI: 1.26, 5.39) times higher than the odds that a non-RA control had pneumonia.
meps1718wt3.design <- svydesign(ids=~1,
weights=~get.weights(ps.meps1718,
stop.method = "es.mean"),
data=meps1718) # using twang ATT weights
adjout2.wt3 <- svyglm(pneumo ~ RA, design=meps1718wt3.design,
family=quasibinomial())
wt_twangatt_results2 <- tidy(adjout2.wt3, conf.int = TRUE, exponentiate = TRUE) %>%
filter(term == "RA") %>% select(-statistic, - p.value)
wt_twangatt_results2 %>% kable(dig=2)
term | estimate | std.error | conf.low | conf.high |
---|---|---|---|---|
RA | 3 | 0.34 | 1.54 | 5.87 |
The odds of having pneumonia in RA individuals was 3 (95%CI: 1.54, 5.87) times higher than the odds that a non-RA control had pneumonia.
I am using the double robust approach to estimate the odds of pneumonia in RA vs non-RA patients.
The logistic regression approach below uses the svydesign
and svyglm
functions from the survey
package. It is double robust because in addition to RA
, it also adds linps
dr.pneumo.wt1 <- svyglm(pneumo ~ RA + linps, design=meps1718wt1.design,
family=quasibinomial())
dr_att_pneumo <- tidy(dr.pneumo.wt1, exponentiate = TRUE, conf.int = TRUE) %>%
filter(term == "RA")
dr_att_pneumo %>% kable(digits=2)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
RA | 2.88 | 0.37 | 2.86 | 0 | 1.39 | 5.93 |
The odds of having pneumonia in RA individuals was 2.88 (95%CI: 1.39, 5.93) times higher than the odds that a non-RA control had pneumonia.
twang
ATT weightsdr.out2.wt3 <- svyglm(pneumo ~ RA + linps, design=meps1718wt3.design,
family=quasibinomial())
dr_twangatt_out2 <- tidy(dr.out2.wt3, exponentiate = TRUE, conf.int = TRUE) %>%
filter(term == "RA")
dr_twangatt_out2
# A tibble: 1 x 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 RA 2.97 0.346 3.15 0.00167 1.51 5.84
Below are the OR and the associated 95% CI for each of the PS analyses
fullpo_tte <- tibble(
analysis = c("Unadjusted", "1:1 Greedy Matching", "1:1 Matching with Caliper",
"ATT Weighting by Inverse PS", "Twang ATT Weighting", "Doubly Robust"),
estimate = c(4.868, 2.64, 2.18, 2.60,3.00, 2.88),
conf.low = c(2.813, 1.32, 1.07, 1.26, 1.54, 1.39),
conf.high = c(8.502, 5.28, 4.45, 5.39, 5.87, 5.93))
ggplot(fullpo_tte, aes(x = analysis, y = estimate)) +
geom_errorbar(aes(ymax = conf.high, ymin = conf.low), width = 0.5) +
geom_label(aes(label = estimate), size = 5) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust=1)) +
labs(title = "Compariso of Odds Ratios and 95% Confidence Intervals (N=2502)",
subtitle = "1:1 Matching with a Caliper Preferred (Best Balance)",
x = "")
Of all of these methods, I prefer the 1:1 matching with a caliber because it produced the best covariate balance.
Although I already had the matched sample from the caliper match meps1718_matched4
above, I created another one meps1718_matched4_s
because I noticed that the code in the example included an arrange statement at the bottom and I’m not sure if we need that.
meps1718_matched4_s <-
cbind(match4_matches,
meps1718[c(match_4$index.control,
match_4$index.treated),]) %>%
arrange(match4_matches)
Below is the matched sample from the 1:1 caliper match without replacement
head(meps1718_matched4_s)
match4_matches dupersid RA RA_f pneumo pneumo_f totexp panel age sex race
1 1987 94379104 0 No 0 No 21156 22 85 F Other
2 1987 10014101 1 Yes 0 No 2000 21 71 F White
3 1988 91142101 0 No 0 No 2400 22 71 F White
4 1988 10404101 1 Yes 0 No 5295 21 77 F Black
5 1989 17587101 0 No 0 No 5363 21 78 F White
marital region income education public_ins prim_care asthma diabetes
1 widowed South Near_poor no_degree Yes Yes No No
2 divorced Northeast High graduate No Yes No No
3 married South High HS Yes Yes No No
4 married South High BA No Yes No No
5 married Midwest Middle HS No No No No
cancer bronchitis emphysema stroke perc_health mental employment IADL ADL
1 No No No No Good Good No Yes Yes
2 No No No No Fair Fair Yes No No
3 No No No No Fair Fair Yes No No
4 Yes No No No Good VG No No No
5 Yes No No No Fair Good No Yes Yes
smoke phys_lim cog_dif highbp chd angina MI other_heart high_chol arthritis
1 no yes no Yes No No No No No No
2 no yes no No No No No No Yes Yes
3 no no no No No No No No No No
4 no no no Yes No No No No No Yes
5 no yes yes Yes No Yes No No Yes Yes
treat ps linps wts1
1 FALSE 0.5737117 0.29701130 1.3458305
2 TRUE 0.5725266 0.29216693 1.0000000
3 FALSE 0.5121045 0.04842762 1.0496194
4 TRUE 0.5135560 0.05423738 1.0000000
5 FALSE 0.4230939 -0.31008533 0.7333844
[ reached 'max' / getOption("max.print") -- omitted 1 rows ]
This 2 x 2 table that I am making from the matched sample will be plugged into the spreadsheet to get an exact value of gamma
tmp <- meps1718_matched4_s %>%
mutate(res = 10*RA + pneumo_f) %>%
group_by(match4_matches) %>%
summarize(out.treated = pneumo_f[2],
out.control = pneumo_f[1])
Warning: Problem with `mutate()` input `res`.
ℹ '+' not meaningful for factors
ℹ Input `res` is `10 * RA + pneumo_f`.
Warning in Ops.factor(10 * RA, pneumo_f): '+' not meaningful for factors
`summarise()` ungrouping output (override with `.groups` argument)
tmp %>% tabyl(out.control, out.treated) %>% adorn_title()
out.treated
out.control No Yes
No 441 24
Yes 13 0
Gamma_results_from_Excel
The gamma obtained was 5.426. This gamma indicates a highly insensitive study (extremely far from 1)
Ways of interpreting:
The result was not sensitive to an unmeasured binary covariate which led to a 5.43 fold increase in the odds of exposure to RA and was a perfect predictor of pneumonia.
To explain the association in this particular study, one would need a hidden bias of 5.43
To attribute the observed significant outcome to an unobserved covariate rather than to RA, that observed covariate has to increase the odds of being in the RA group by a factor of 5.43 and also predict pneumonia quite well.
binarysens
The code below didn’t work, which is why I had to do it on the spreadsheet. The error stated: “Error in if (y.tmp1 >= y.tmp2) { : missing value where TRUE/FALSE needed”
# binarysens(match_4, Gamma = 2.5, GammaInc = 0.25)
binarysens
# binarysens(m.obj, Gamma = 1.75, GammaInc = 0.05)$bounds %>%
# tibble() %>% slice(11:17)
In conclusion, the results from my all of my analyses indicate that the burden of pneumonia among a nationally representative sample of adults (N=2500), is greater among people treated for rheumatoid arthritis. The PS analyses used to balance the groups were 1:1 greedy matching, 1:1 matching with a caliper without replacement, weighting with the inverse propensity score, and twang weighting.
As seen in the plot below, the adjusted ORs range from 2.18 (95% CI 1.07, 4.45) (1:1 match with a caliper) to 3.00 (1.54 to 5.87) (Twang ATT weighting), which are a meaningful drop from the unadjusted OR of 4.87 (95% CI 2.81, 8.50). This great drop is due to the substantial selection bias in the unadjusted analysis, which can be seen by the lack of overlap in the linear PS (Rubin Rule 1= 1.551, Rubin Rule 2=0.567).
fullpo_tte <- tibble(
analysis = c("Unadjusted", "1:1 Greedy Matching", "1:1 Matching with Caliper",
"ATT Weighting by Inverse PS", "Twang ATT Weighting", "Doubly Robust"),
estimate = c(4.868, 2.64, 2.18, 2.60,3.00, 2.88),
conf.low = c(2.813, 1.32, 1.07, 1.26, 1.54, 1.39),
conf.high = c(8.502, 5.28, 4.45, 5.39, 5.87, 5.93))
ggplot(fullpo_tte, aes(x = analysis, y = estimate)) +
geom_errorbar(aes(ymax = conf.high, ymin = conf.low), width = 0.5) +
geom_label(aes(label = estimate), size = 5) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust=1)) +
labs(title = "Compariso of Odds Ratios and 95% Confidence Intervals (N=2502)",
subtitle = "1:1 Matching with a Caliper Preferred (Best Balance)",
x = "")
Of all of the PS analyses, I preferred the 1:1 matching with a caliper, which also produced the lowest OR and tightest CI. This was the best method for balancing the groups because it almost perfectly eliminated the observed covariate imbalance between the groups, indicated by Rubin’s Rules 1 and 2 of 0.004 and 1.010, respectively. The improvement in overlap can also visually be seen in the figure below.
p1 <- ggplot(meps1718, aes(x = linps, fill = RA_f)) +
geom_density(alpha = 0.3)
p2 <- ggplot(meps1718_matched4, aes(x = linps, fill = RA_f)) +
geom_density(alpha = 0.3)
p1 + p2
The sensitivity analysis using the matched sample from my preferred 1:1 caliper match produced a gamma of 5.43. This extraordinarily high gamma, indicating a highly insensitive study, should be interpreted with caution. Using the small control sample size (a random subsample of 1986 adults from the available 33,010), there were no matched pairs that experienced the outcome, pneumonia. If all of the controls were used from the original MEPS sample, this gamma (and my results) would be meaningfully different. Thus, my next step will be to include all of the controls and then only filter those with a PS below the minimum PS in the RA group. I will then use the new subset of controls to fit a new PS.
Although this analysis is not yet finished, the relationship observed is consistent with the results seen in a previous case control study, the high number of infections observed in RA drug clinical trials, the pathophysiology of the disease (the lungs are one of the most common sites for extra articular disease), and the mechanism of action of the drugs. These results may help contribute to the scientific community in terms of public health planning, such as ensuring that RA patients are up to date on pneumonia, COVID-19, and influenza immunizations.
This project was the most valuable piece of work that I have done this year. First, I learned how to use the MEPS database for research. This was by far was one of the most difficult parts of the project, as there were many files to navigate with a profound amount of information, intricacies of the study design that were necessary to understand the variables and which were appropriate, and certain data collection limitations that determined what analyses I could complete. The MEPS workshops were extremely helpful, and all of the instructors have given me fast and detailed responses to my multitude of emails. I have gained such an appreciation for this rich database and I cannot believe it is free to the public. I am incredibly grateful that I this class gave me the opportunity to use it.
Second, I learned about what should be included in a propensity score. I thought after reading Rosenbaum, Rubin’s essay, and taking notes in class that I understood and it was simple. However, my original covariate list was less than 15 covariates. As the project went on and I was learning more and more about the database and reading other papers/listening to OSIA presentations, I finally understood that our goal is to pick up as much signal as possible and make the groups as equal as possible. For example, at first, I was very meticulous about including which comorbidities were shown in the literature to potentially be related to my exposure RA, or my outcome, pneumonia. So, I wasn’t including comorbidities like hypertension or hyperlipidemia as these have not been shown to be associated with RA or be risk factors for pneumonia (although other cardiovascular comorbidities are such as CAD). However, after Dr. Love said that one of his papers had 90 something covariates, and I was getting a better feel for propensity scores, I realized it didn’t matter that factors like hypertension, hyperlipidemia, or being a widow weren’t related to the exposure or outcome, what matters is that these groups are as similar as we can possibly make them so we can remove some hidden bias. When I would go for runs in the morning, I would start thinking about all of the other covariates I could add (MEPS is the gift that keeps on giving with covariates) because there is going to be all sorts of hidden bias in a non-randomized trial, but we can help eliminate some of that if we balance people on any available baseline characteristics. I stress baseline, because as I got up to 40 something covariates, that was also a mistake. I was including covariates that were measured at the end of the interview period, such as expenditure, which is incorrect. Not only was this wrong, when I did this, my OR crossed 1 (but also that was back when I was including all RA patients regardless of rx filled). My huge lesson learned is that at the start of the project, select all of the baseline characteristics that are available. Then, keep them at their most granular form. Only collapse them if there are less than 20 people in a particular group for a certain level. Then, evaluate missingness. If there is more than like 25% missing, just drop it (I only had to do this for flu vaccine which had 50% missing). Lastly, at the very end of the project, I learned I need to add weights into the propensity scores. This is another next step.
Third, I learned how many different methods for weighting and matching there are. Before the project, I knew (from class) about 1:k matching with and without replacement and Twang and inverse PS weighting. I was familiar with the concept of caliper, but I didn’t really understand it after learning about it in class. This project allowed me to practice matching with a caliper and see its advantages (super strong balance) and disadvantages (dropping hard to match people from the exposed).
Fourth, I learned a lot about data cleaning in R and making work more reproducible. At first, I was cleaning in both R and SAS. Dr. Love explained to me that it was fine if I chose to do that, but that I need to be able to go back a few years from now and know exactly what I did. By having 4 different documents in different coding languages that were so disorganized, there was no way that I would be able to reproduce what I did. Now I am down to two documents and they are both in R and much better organized.
Lastly, I also learned a lot from other people’s presentations. I liked how Stephanie compared her results from all of the matches and used a plot with 95% CI to do so. She was kind enough to provide me with the code, which I was so happy to include in my project. Although this might seem like something really simple, I know I will be able to use that code in the future. Amr’s presentation using a matching method that matched up to 2 controls for each exposed was a great learning experience for me. I recently read a JAMA paper that used HCUP-NIS data for PS matching and I thought that it was really strong that they did up to two matches; like that their approach was superior to mine because they could have more controls. However, as you explained, it is misleading as they really do not have two matches for every treated. And if you want something similar, to just do weighting. Now that I have learned that weighting is actually better than that method, I will not try to emulate that approach. Finally, while I was presenting, I learned about what happens when the PS is really close to 0 or 1. I learned that you should go back and find out which covariates might be the culprit of this. But furthermore, I learned that I can solve that when I include more controls and remove subjects who do not have PS close to the lower end of the PS range of my exposed group.
To complete this project to my satisfaction I will: