library(janitor)
library(knitr)
library(broom)
library(magrittr)
library(patchwork)
library(ggrepel)
library(viridis)
library(tidyverse)
The tibble below shows the data ingested from the County Health Rankings website
data_url <- "https://www.countyhealthrankings.org/sites/default/files/media/document/analytic_data2020_0.csv"
chr_2020_raw <- read_csv(data_url, skip = 1, guess_max = 4000)
chr_2020_raw
# A tibble: 3,194 x 786
statecode countycode fipscode state county year county_ranked v001_rawvalue
<chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 00 000 00000 US Unite… 2020 NA 6940.
2 01 000 01000 AL Alaba… 2020 NA 9943.
3 01 001 01001 AL Autau… 2020 1 8129.
4 01 003 01003 AL Baldw… 2020 1 7354.
5 01 005 01005 AL Barbo… 2020 1 10254.
6 01 007 01007 AL Bibb … 2020 1 11978.
7 01 009 01009 AL Bloun… 2020 1 11335.
8 01 011 01011 AL Bullo… 2020 1 11680.
9 01 013 01013 AL Butle… 2020 1 14360.
10 01 015 01015 AL Calho… 2020 1 12079.
# … with 3,184 more rows, and 778 more variables: v001_numerator <dbl>,
# v001_denominator <dbl>, v001_cilow <dbl>, v001_cihigh <dbl>,
# v001_flag <dbl>, v001_race_aian <dbl>, v001_race_aian_cilow <dbl>,
# v001_race_aian_cihigh <dbl>, v001_race_aian_flag <dbl>,
# v001_race_asian <dbl>, v001_race_asian_cilow <dbl>,
# v001_race_asian_cihigh <dbl>, v001_race_asian_flag <dbl>,
# v001_race_black <dbl>, v001_race_black_cilow <dbl>,
# v001_race_black_cihigh <dbl>, v001_race_black_flag <dbl>,
# v001_race_hispanic <dbl>, v001_race_hispanic_cilow <dbl>,
# v001_race_hispanic_cihigh <dbl>, v001_race_hispanic_flag <dbl>,
# v001_race_white <dbl>, v001_race_white_cilow <dbl>,
# v001_race_white_cihigh <dbl>, v001_race_white_flag <dbl>,
# v002_rawvalue <dbl>, v002_numerator <lgl>, v002_denominator <lgl>,
# v002_cilow <dbl>, v002_cihigh <dbl>, v036_rawvalue <dbl>,
# v036_numerator <lgl>, v036_denominator <lgl>, v036_cilow <dbl>,
# v036_cihigh <dbl>, v042_rawvalue <dbl>, v042_numerator <lgl>,
# v042_denominator <lgl>, v042_cilow <dbl>, v042_cihigh <dbl>,
# v037_rawvalue <dbl>, v037_numerator <dbl>, v037_denominator <dbl>,
# v037_cilow <dbl>, v037_cihigh <dbl>, v037_flag <dbl>, v037_race_aian <dbl>,
# v037_race_aian_cilow <dbl>, v037_race_aian_cihigh <dbl>,
# v037_race_asian <dbl>, v037_race_asian_cilow <dbl>,
# v037_race_asian_cihigh <dbl>, v037_race_black <dbl>,
# v037_race_black_cilow <dbl>, v037_race_black_cihigh <dbl>,
# v037_race_hispanic <dbl>, v037_race_hispanic_cilow <dbl>,
# v037_race_hispanic_cihigh <dbl>, v037_race_white <dbl>,
# v037_race_white_cilow <dbl>, v037_race_white_cihigh <dbl>,
# v009_rawvalue <dbl>, v009_numerator <lgl>, v009_denominator <lgl>,
# v009_cilow <dbl>, v009_cihigh <dbl>, v011_rawvalue <dbl>,
# v011_numerator <dbl>, v011_denominator <lgl>, v011_cilow <dbl>,
# v011_cihigh <dbl>, v133_rawvalue <dbl>, v133_numerator <dbl>,
# v133_denominator <dbl>, v133_cilow <lgl>, v133_cihigh <lgl>,
# v070_rawvalue <dbl>, v070_numerator <dbl>, v070_denominator <lgl>,
# v070_cilow <dbl>, v070_cihigh <dbl>, v132_rawvalue <dbl>,
# v132_numerator <dbl>, v132_denominator <dbl>, v132_cilow <lgl>,
# v132_cihigh <lgl>, v049_rawvalue <dbl>, v049_numerator <lgl>,
# v049_denominator <lgl>, v049_cilow <dbl>, v049_cihigh <dbl>,
# v134_rawvalue <dbl>, v134_numerator <dbl>, v134_denominator <dbl>,
# v134_cilow <dbl>, v134_cihigh <dbl>, v045_rawvalue <dbl>,
# v045_numerator <dbl>, v045_denominator <dbl>, v045_cilow <lgl>, …
I will be selecting data from five states which include: Ohio, Minnesota, California, New York and Washington. I selected the non-required states based on their national ranking of premature death. California, Minnesota, New York, and Washington rank 1, 2, 3, and 5 for premature death rates in the U.S.
I have selected five variables: premature death, income inequality, sexually transmitted infections, high school graduation, and % not proficient in English. These variables were coded in the raw 2020 County Health Rankings Data as v001, v044, v045, v021, and v059, respectively.
chr_2020 <- chr_2020_raw %>%
filter(county_ranked == 1) %>%
filter(state %in% c("OH", "CA", "MN", "NY", "WA")) %>%
select(state,county, fipscode, v001_rawvalue,v044_rawvalue, v045_rawvalue, v021_rawvalue, v059_rawvalue ) %>%
rename(YPLL=v001_rawvalue,
income=v044_rawvalue,
std=v045_rawvalue,
hs_grad=v021_rawvalue,
not_english_prof=v059_rawvalue )
chr_2020
# A tibble: 334 x 8
state county fipscode YPLL income std hs_grad not_english_prof
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CA Alameda County 06001 4409. 5.21 553. 0.868 0.0844
2 CA Alpine County 06003 32794. 4.19 804. NA 0.00542
3 CA Amador County 06005 6364. 4.25 171. 0.885 0.0110
4 CA Butte County 06007 7784. 5.57 527. 0.845 0.0250
5 CA Calaveras County 06009 7346. 4.64 166. 0.897 0.00725
6 CA Colusa County 06011 6556. 4.22 225. 0.893 0.160
7 CA Contra Costa Cou… 06013 4624. 4.74 500. 0.879 0.0648
8 CA Del Norte County 06015 8955. 5.07 364 0.810 0.0168
9 CA El Dorado County 06017 5165. 4.87 238. 0.894 0.0173
10 CA Fresno County 06019 7007. 5.13 728. 0.815 0.120
# … with 324 more rows
To improve interpretation, the years of potential life lost, YPLL
, was divided by 100 to represent losses per 1000 population
chr2020<-chr_2020$YPLL <- chr_2020$YPLL / 100
Variables that were proportions were converted to percentages. These variables were high school graduation,hs_grad
, and proportion not English proficient, not_english_prof
.
chr2020<-chr_2020$hs_grad <- chr_2020$hs_grad * 100
ch2020<-chr_2020$not_english_prof <- chr_2020$not_english_prof * 100
The table below lists detailed numerical summaries for hs_grad
, not_english_prof
, and YPLL
. This shows that:
We have successfully converted the two proportion variables (hs_grad
and not_english_prof
) to percentages (values between 0 and 100).
YPLL
has been successfully been converted to losses per 1000 people (if you compare the ranges of values seen below to the raw tibble in section 1.2)
psych::describe(chr_2020 %>% select(hs_grad, not_english_prof, YPLL)) %>% kable(digits=2)
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
hs_grad | 1 | 333 | 86.63 | 7.56 | 87.50 | 87.32 | 6.24 | 36.30 | 100.00 | 63.70 | -2.24 | 10.71 | 0.41 |
not_english_prof | 2 | 334 | 2.40 | 3.67 | 0.78 | 1.50 | 0.81 | 0.00 | 20.75 | 20.75 | 2.37 | 5.49 | 0.20 |
YPLL | 3 | 334 | 70.53 | 24.70 | 67.95 | 68.35 | 19.62 | 32.07 | 327.94 | 295.88 | 3.87 | 34.67 | 1.35 |
fipscode
and factoring the state
I have used the following codes below to make fipscode
a character variable and state
a factor variable. I then leveled the states in order of rank of premature death rates.
chr_2020 <- chr_2020 %>%
mutate(fipscode = str_pad(fipscode, 5, pad = "0"))
chr_2020 <- chr_2020 %>%
mutate(state=factor(state, levels = c("OH","CA", "MN", "NY", "WA")))
Given the five states I selected, I should have 334 rows, since there are 334 counties across those states, and I should have 8 variables. The glimpse function below confirms this. We can see there are 334 rows and 8 columns. Of note, we can also confirm that state is a factor variable and fipscode is a character variable now.
glimpse(chr_2020)
Rows: 334
Columns: 8
$ state <fct> CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, …
$ county <chr> "Alameda County", "Alpine County", "Amador County", …
$ fipscode <chr> "06001", "06003", "06005", "06007", "06009", "06011"…
$ YPLL <dbl> 44.08591, 327.94178, 63.63866, 77.84496, 73.45886, 6…
$ income <dbl> 5.207727, 4.193746, 4.249300, 5.574450, 4.638414, 4.…
$ std <dbl> 553.2, 803.6, 170.9, 527.3, 166.4, 224.7, 499.5, 364…
$ hs_grad <dbl> 86.81413, NA, 88.48315, 84.51454, 89.74895, 89.34426…
$ not_english_prof <dbl> 8.4416315, 0.5420054, 1.0993250, 2.4996271, 0.725190…
The code below shows each of the states has the anticipated number of counties.
chr_2020 %>% tabyl(state) %>% adorn_pct_formatting()
state n percent
OH 88 26.3%
CA 58 17.4%
MN 87 26.0%
NY 62 18.6%
WA 39 11.7%
Below I have split not_english_prof
(% of a county not proficient in English) into two categories, which was based on above and below 10% of the county not being proficient in English. I decided the cut off of 10% by evaluating the spread on a histogram and then doing a count of how many counties met this 10% cut off.
ggplot(data = chr_2020, aes(x = not_english_prof)) + geom_histogram(bins=20) +
labs (title="Spread of % not proficient in English across 334 counties", x="% of a county not proficient in English")
chr_2020 %>%
count(not_english_prof>10)
# A tibble: 2 x 2
`not_english_prof > 10` n
<lgl> <int>
1 FALSE 313
2 TRUE 21
chr_2020 <- chr_2020 %>%
mutate(not_english_prof_10pct = case_when(
not_english_prof > 10 ~ "above10percent",
TRUE ~ "below10percent"),
not_english_prof_10pct = factor(not_english_prof_10pct))
mosaic::favstats(not_english_prof ~ not_english_prof_10pct, data = chr_2020) %>%
kable(digits = 1)
not_english_prof_10pct | min | Q1 | median | Q3 | max | mean | sd | n | missing |
---|---|---|---|---|---|---|---|---|---|
above10percent | 10.2 | 11.2 | 13.2 | 15.0 | 20.7 | 13.6 | 2.9 | 21 | 0 |
below10percent | 0.0 | 0.4 | 0.7 | 1.9 | 10.0 | 1.6 | 2.2 | 313 | 0 |
Below we can see we have 9 columns because I have created a new variable not_english_prof_10pct
. Furthermore, there are still 334 rows, corresponding to 334 counties.
names(chr_2020)
[1] "state" "county" "fipscode"
[4] "YPLL" "income" "std"
[7] "hs_grad" "not_english_prof" "not_english_prof_10pct"
nrow(chr_2020)
[1] 334
In this section I will be creating a multicategorical variable from my std
variable
I wanted to create a categorical variable of equal size for my 334 observations on sexually transmitted infections (number of newly diagnosed chlamydia cases per 100,000 population). I used the cut2 function in the Hmisc package to evenly divide this variable into three groups. I then re-coded these three cut points with low, middle, and high to represent where the chlamydia prevalence was <250, 250 to 365, 365 to 1204 cases per 100,000 population, respectively.
chr_2020 <- chr_2020 %>%
mutate(std_cut3 = factor(Hmisc::cut2(std, g = 3)))
mosaic::favstats(std ~ std_cut3, data = chr_2020) %>%
kable(digits = 3)
std_cut3 | min | Q1 | median | Q3 | max | mean | sd | n | missing |
---|---|---|---|---|---|---|---|---|---|
[ 39.9, 250) | 39.9 | 158.25 | 188.60 | 219.150 | 248.3 | 183.468 | 44.933 | 111 | 0 |
[249.6, 365) | 249.6 | 276.25 | 297.00 | 332.600 | 364.0 | 304.756 | 34.267 | 110 | 0 |
[365.0,1204] | 365.0 | 403.65 | 478.55 | 605.125 | 1203.9 | 527.796 | 159.953 | 110 | 0 |
chr_2020 <- chr_2020 %>%
mutate(std_cut3 = fct_recode(std_cut3,
low = "[ 39.9, 250)",
middle = "[249.6, 365)",
high = "[365.0,1204]")) %>%
mutate(std_cut3 = fct_relevel(std_cut3,
"low", "middle", "high"))
mosaic::favstats(std ~ std_cut3, data = chr_2020) %>%
kable(digits = 3)
std_cut3 | min | Q1 | median | Q3 | max | mean | sd | n | missing |
---|---|---|---|---|---|---|---|---|---|
low | 39.9 | 158.25 | 188.60 | 219.150 | 248.3 | 183.468 | 44.933 | 111 | 0 |
middle | 249.6 | 276.25 | 297.00 | 332.600 | 364.0 | 304.756 | 34.267 | 110 | 0 |
high | 365.0 | 403.65 | 478.55 | 605.125 | 1203.9 | 527.796 | 159.953 | 110 | 0 |
Next I will check the structure of my tibble. I am checking that:
the initial row tells me that this is a tibble and specifies its dimensions (334 counties x 10 variables)
I still have the complete set of 334 rows (counties)
I have included 10 variables:
2 character variables: fipscode
and county
3 factor variables:
state
(5 levels)
not_english_prof_10pct
(2 levels: above10percent, below10percent)
std_cut3
(3 levels: low, medium, high)
5 numerical variables: YPLL
, income
, std
, hs_grad
, not_english_prof
hs_grad
and not_english_prof
are percentages (values between 0 and 100).str(chr_2020)
tibble [334 × 10] (S3: tbl_df/tbl/data.frame)
$ state : Factor w/ 5 levels "OH","CA","MN",..: 2 2 2 2 2 2 2 2 2 2 ...
$ county : chr [1:334] "Alameda County" "Alpine County" "Amador County" "Butte County" ...
$ fipscode : chr [1:334] "06001" "06003" "06005" "06007" ...
$ YPLL : num [1:334] 44.1 327.9 63.6 77.8 73.5 ...
$ income : num [1:334] 5.21 4.19 4.25 5.57 4.64 ...
$ std : num [1:334] 553 804 171 527 166 ...
$ hs_grad : num [1:334] 86.8 NA 88.5 84.5 89.7 ...
$ not_english_prof : num [1:334] 8.442 0.542 1.099 2.5 0.725 ...
$ not_english_prof_10pct: Factor w/ 2 levels "above10percent",..: 2 2 2 2 2 1 2 2 2 1 ...
$ std_cut3 : Factor w/ 3 levels "low","middle",..: 3 3 1 3 1 1 3 2 1 3 ...
This is a table listing all 10 variables that I am evaluating with their original code from the raw file. Critical information was drawn from the County Health Ranking web site
Variable | Description |
---|---|
fipscode | FIPS code |
state | State: my five states are OH, CA, MN, NY, WA |
county | County Name |
YPLL | (v001) Premature Death (Years of Potential Life Lost), which will be my outcome |
income | (v044) Income Inequality |
hs_grad | (v021) High School Graduation Rate |
std | (v045) Sexually Transmitted Infections |
not_english_prof | (v059) % Not Proficient in English |
not_english_prof_10pct | 2 levels: above10percent = % not english proficient above 10%, or below10percent = % not english proficient below 10% |
std_cut3 | 3 levels: low = chlamydia prevalence below 250 cases per 100,000 population, middle=chlamydia prevalence between 250-365 cases per 100,000 population, high=chlamydia prevalence above 365 cases per 100,000 population |
More details on my five original variables are specified below. These results are summarized versions of the summaries linked on the County Health Rankings site.
YPLL
was originally variable v001_rawvalue
, and is listed in the Length of Life subcategory under Health Outcomes at County Health Rankings. It describes Premature Death, also referred to as the Years of Potential Life Lost (YPLL) before age 75 per 1000 population. This is an age adjusted value (to the year 2000 US population) because age is a non-modifiable risk factor and as age increases, poor health outcomes are more likely. The numerator is the number of years of potential life lost for deaths that occurred among people who reside in a county under the age 75 for 3 years. The denominator is the sum of the population younger than 75 years old. This is an important measure because it reflects the County Health Rankings’ intent to focus on deaths that could have been prevented. The 2020 County Health Rankings used data from National Center for Health Statistics - Mortality Files from 2016-2018 for this measure. Lastly, a major limitation to this variable is that it is difficult to interpret, even for clinicians. This will be my outcome variable.
income
was originally variable v044_rawvalue
, and is listed in the Social & Economic Factors subcategory under Social & Economic Factors at County Health Rankings. It describes the ratio of household income at the 80th percentile to income at the 20th percentile, according to American Community Survey, 5-year estimates of the county’s population. A higher inequality ratio indicates greater division between the top and bottom ends of the income spectrum. It is based on data from American Community Survey, 5-year estimates from 2014-18. This is an important variable because it can have broad health impacts, including increased risk of mortality, which coincides with my outcome variable.
std
was originally variable v045_rawvalue
, and is listed in the Sexual Activity subcategory under Health Behaviors at County Health Rankings. It describes the number of newly diagnosed chlamydia cases per 100,000 population. It is based on data from the National Center for HIV/AIDS, Viral Hepatitis, STD, and TB Prevention from 2016. An important limitation to this data is that increases in cases reported may be due to increased screening rates and improved sensitivity of tests. Most importantly, areas with low resources may have low screening rates, and therefore, the incidence may be falsely low. This is an important variable because sexually transmitted infections are associated with a significantly increased risk of morbidity and mortality. Of note, this variable is also a measure of health equality because Chlamydia disproportionately affects low resource communities.
hs_grad
was originally variable v021_rawvalue
, and is listed in the Education subcategory under Social & Economic Factors at County Health Rankings. It describes the percent of the county’s ninth grade cohort that graduates with a high school diploma in four years, and is based on EDFacts data from 2016-17. There should not be comparisons across state lines because states define their data differently.
not_english_prof
was originally variable v059_rawvalue
, and is listed under Demographics at County Health Rankings. It describes the % of the population not proficient in English according to American Community Survey, 5-year estimates from 2014-2018.
The five states I chose were CA, MN, NY, OH and WA. The numbers of counties in each of theses states are 58, 87, 62, 88, and 39, respectively. The total number of counties I will be studying are 334. These values can be seen in the table below. My motivation for choosing CA, MN, NY, and WA was that they rank 1, 2, 3, and 5 for premature death, respectively.
chr_2020 %>% group_by(state) %>%
count() %>%
kable()
state | n |
---|---|
OH | 88 |
CA | 58 |
MN | 87 |
NY | 62 |
WA | 39 |
The five variables (raw value, renamed) I chose were:
v001
, YPLL
).This variable is interesting to me because it greatly highlights inequality and is a method to measure progress in public health measures over time. There are various limitations to this measure, most importantly, that clinicians even have difficulty interpreting it. However, despite its difficult interpretation, YPLL truly emphasizes deaths of younger persons, which clearly should have been prevented. I am passionate about this because there is no reason why people should have a shorter lifespan due to inequality. I think this serves as a great base for my remaining predictor variables which are fundamental predictors of inequality in health care.
v044
, income
)I chose this variable because income inequality can be associated with barriers in access to care and unhealthy lifestyles. Furthermore, I am interested in how socioeconomic factors are associated with health outcomes. I believe this disparity to have a strong link to my outcome variable, years of potential life lost.
v045
, std
)I chose this variable for two reasons: (1) I read that infection rates are associated with premature mortality (2) This is a marker for risky behavior. Populations with a tendency towards risky behaviors may have an interaction with premature mortality.
This was the variable that I split into three groups: ‘low’, ‘medium’ and ‘high’. The method for splitting them into 3 groups ensured that each group was of equal size. I thought this would be a good group to split into categories because it would make it easier to interpret than if it was continuous. Now, we can simply look at the analysis and evaluate where chlamydia incidence is low, medium, or high. Furthermore, I think it will be interesting to facet plots by this variable.
v021
, hs_grad
)I chose this variable because I used to teach high school at an at-risk school where the majority of the students had free or reduced price lunch. I have witnessed the great inequality that stems from not having an education. Not only am I just interested in evaluating high school graduation rates, I am interested in the interaction between life years lost and how education might be associated with it. Of all my variables, this is by far the one I am most passionate about.
v059
, not_english_prof
)I chose this variable because language and the barriers it creates in health care is interesting to me. I minored in Hispanic Studies in undergrad and taught at a school where English was not many of the students first language. Throughout my rotations in pharmacy school, I was constantly translating at the hospital. I have observed how health is greatly affected not only by health literacy, but also by the inability to properly communicate with the health care provider. I believe that there should be an interaction between this and years of potential life lost.
I chose to create this variable as my binary variable. I believe converting the percent of people not proficient in English into a binary variable would make it easier to define those counties proficient in English and those that are not. This seems very clear to me. I first made a histogram to look at the distribution of counties with % not English proficient. I found that the majority of the counties had a very low percentage of people who weren’t English proficient. It seemed rational to me that if a county had less than 90% English proficient (>10% not proficient), then that would represent a good cut off for inequality. Furthermore, this cut off would at least capture 21 counties that had a % not English proficient above 10%
ggplot(data = chr_2020, aes(x = not_english_prof)) + geom_histogram(bins=20) +
labs (title="Spread of % not proficient in English across 334 counties", x="% of a county not proficient in English")
Below is a tibble of the data which verifies my variables and shows the first 10 rows of observations. Once again, we can see at the top that there are 334 rows, which are the data from all 334 of the counties across 5 states. This meets the requirement of selecting 200-400 counties.
Furthermore, the tibble shows us the type of the 10 variables we have:
3 factor variables: state
and the two I created, std_cut3
and not_english_prof_10pct
5 numerical variables: YPLL
, income
, std
, hs_grad
, not_english_prof
2 character variable: county
and fipscode
chr_2020
# A tibble: 334 x 10
state county fipscode YPLL income std hs_grad not_english_prof
<fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CA Alame… 06001 44.1 5.21 553. 86.8 8.44
2 CA Alpin… 06003 328. 4.19 804. NA 0.542
3 CA Amado… 06005 63.6 4.25 171. 88.5 1.10
4 CA Butte… 06007 77.8 5.57 527. 84.5 2.50
5 CA Calav… 06009 73.5 4.64 166. 89.7 0.725
6 CA Colus… 06011 65.6 4.22 225. 89.3 16.0
7 CA Contr… 06013 46.2 4.74 500. 87.9 6.48
8 CA Del N… 06015 89.5 5.07 364 81.0 1.68
9 CA El Do… 06017 51.6 4.87 238. 89.4 1.73
10 CA Fresn… 06019 70.1 5.13 728. 81.5 12.0
# … with 324 more rows, and 2 more variables: not_english_prof_10pct <fct>,
# std_cut3 <fct>
Below I ran describe
from the Hmisc
package on my data. For now, the most important pieces of information from here are the missing values and the uniqueness of the observations. We can see that there are missing values for std
(n=3) and hs_grad
(n=1). The requirement is that we cannot have more than 25% missing values. Even if all of these N/As belonged to one state, it would still be less than 25% missing values (based on the number of counties each state has).
Another critical part of the information below is the minimum and maximum values for each of the variables. From these minimum and maximum values for each variable we can evaluate if these values seem plausible. If not, we can explore them deeper.
Lastly, we can also see the number of counties meets the required number of counties for each state by the n value.
Hmisc::describe(chr_2020)
chr_2020
10 Variables 334 Observations
--------------------------------------------------------------------------------
state
n missing distinct
334 0 5
lowest : OH CA MN NY WA, highest: OH CA MN NY WA
Value OH CA MN NY WA
Frequency 88 58 87 62 39
Proportion 0.263 0.174 0.260 0.186 0.117
--------------------------------------------------------------------------------
county
n missing distinct
334 0 300
lowest : Adams County Aitkin County Alameda County Albany County Allegany County
highest: Yakima County Yates County Yellow Medicine County Yolo County Yuba County
--------------------------------------------------------------------------------
fipscode
n missing distinct
334 0 334
lowest : 06001 06003 06005 06007 06009, highest: 53069 53071 53073 53075 53077
--------------------------------------------------------------------------------
YPLL
n missing distinct Info Mean Gmd .05 .10
334 0 334 1 70.53 23.67 42.60 45.72
.25 .50 .75 .90 .95
54.88 67.95 81.41 96.54 106.06
lowest : 32.06632 33.67811 33.69602 35.61596 35.79081
highest: 126.58897 130.60487 130.84896 180.46917 327.94178
--------------------------------------------------------------------------------
income
n missing distinct Info Mean Gmd .05 .10
334 0 334 1 4.392 0.6715 3.603 3.727
.25 .50 .75 .90 .95
3.954 4.326 4.664 5.052 5.555
lowest : 3.029475 3.265478 3.322142 3.363875 3.391893
highest: 6.824572 7.017767 7.028430 7.068538 9.206592
--------------------------------------------------------------------------------
std
n missing distinct Info Mean Gmd .05 .10
331 3 317 1 338.2 181.6 133.1 168.2
.25 .50 .75 .90 .95
219.1 296.0 403.5 576.7 662.5
lowest : 39.9 71.8 74.8 86.4 95.9, highest: 884.6 935.9 1001.4 1033.2 1203.9
--------------------------------------------------------------------------------
hs_grad
n missing distinct Info Mean Gmd .05 .10
333 1 329 1 86.63 7.64 75.38 78.24
.25 .50 .75 .90 .95
83.21 87.50 91.67 94.66 96.08
lowest : 36.30137 39.86254 49.96993 59.34000 64.59000
highest: 96.96970 96.97509 97.49478 98.53000 100.00000
--------------------------------------------------------------------------------
not_english_prof
n missing distinct Info Mean Gmd .05 .10
334 0 331 1 2.4 3.162 0.1194 0.1790
.25 .50 .75 .90 .95
0.3963 0.7752 2.2886 7.9776 10.9270
lowest : 0.00000000 0.05891595 0.05983545 0.06270575 0.06606441
highest: 15.83666782 16.02012579 17.29735909 18.81594636 20.74986071
--------------------------------------------------------------------------------
not_english_prof_10pct
n missing distinct
334 0 2
Value above10percent below10percent
Frequency 21 313
Proportion 0.063 0.937
--------------------------------------------------------------------------------
std_cut3
n missing distinct
331 3 3
Value low middle high
Frequency 111 110 110
Proportion 0.335 0.332 0.332
--------------------------------------------------------------------------------
My 3 important checks are:
1.) Each variable has data for at least 75% of the counties in each state
2.) The raw versions of each of my five variables has at least 10 distinct non-missing values
3.) I have at least 10 counties in each category for each of the categorical variables that I created
We only have missing data for std
(n=3) and hs_grad
(n=1). Washington accounts for 2 of the missing std
observations, but because it has 39 counties in total, there is only 5% (2/39) missing data on this variable for this state. MN has 1 missing value for std
, but since there are 87 counties in MN, this represents only 1.1% (1/87) missing values. Lastly, there is one missing observation for hs_grad
, which is in CA. CA has 58 counties, therefore only 1.7% (1/58) of the observations for this variable are missing in CA.
mosaic::favstats(std ~ state, data = chr_2020) %>%
select(state, n, missing) %>%
mutate(pct_available = 100*(n - missing)/n) %>%
kable()
state | n | missing | pct_available |
---|---|---|---|
OH | 88 | 0 | 100.00000 |
CA | 58 | 0 | 100.00000 |
MN | 86 | 1 | 98.83721 |
NY | 62 | 0 | 100.00000 |
WA | 37 | 2 | 94.59459 |
mosaic::favstats(hs_grad ~ state, data = chr_2020) %>%
select(state, n, missing) %>%
mutate(pct_available = 100*(n - missing)/n) %>%
kable()
state | n | missing | pct_available |
---|---|---|---|
OH | 88 | 0 | 100.00000 |
CA | 57 | 1 | 98.24561 |
MN | 87 | 0 | 100.00000 |
NY | 62 | 0 | 100.00000 |
WA | 39 | 0 | 100.00000 |
Here we can see that the raw versions of each of my five variables has more than 10 distinct non-missing values.
chr_2020 %>%
summarize(across(YPLL:not_english_prof, ~ n_distinct(.))) %>%
kable()
YPLL | income | std | hs_grad | not_english_prof |
---|---|---|---|---|
334 | 334 | 318 | 330 | 331 |
Below we can see that the two categorical variables I created, std_cut3
and not_english_prof_10pct
, have at least 10 counties in each level.
chr_2020 %>% tabyl(std_cut3)
std_cut3 n percent valid_percent
low 111 0.332335329 0.3353474
middle 110 0.329341317 0.3323263
high 110 0.329341317 0.3323263
<NA> 3 0.008982036 NA
chr_2020 %>% tabyl(not_english_prof_10pct)
not_english_prof_10pct n percent
above10percent 21 0.06287425
below10percent 313 0.93712575
I am saving this tibble as an R data set in the same location as my original data set within my Project A directory.
saveRDS(chr_2020, file = "chr_2020_Lindsay_Petrenchik.Rds")
The most challenging part of this proposal was choosing variables. First, I went to tutoring and learned about a neat code to evaluate the percentage of NAs for each of the 107 variables across the data set. This code was: (sapply(chr_2020_raw, function(x) round(sum(is.na(x))/3194,2)*100)). I thought I would choose neat variables with maybe 5-10% missing values. Therefore, I evaluated this large list to determine which fit the criteria of <25% missing and then matched them up in the codebook. After matching the raw variables in the codebook, I made a list of which would be interesting, relevant, and have a pattern with others that I was choosing. After creating the exhaustive list of potentially qualifying variables, I realized it was perhaps too random. I was worried I may not see any associations and have a terrible project. So, I went back to the drawing board and disregarded this list. Next, I researched which factors are associated with premature death. Unfortunately, many of the supported factors I found in the literature, such as tobacco and alcohol use, weren’t continuous variables we could choose from. However, education and income were, so that was at least a step in the right direction. I just needed two more. I went back and forth between various variables including diabetes prevalence, motor vehicle accidents, violence, and I can’t even remember what else. Finally, I decided STD prevalence and % not English proficient and hoped for the best. I still regret some of my variables (even my outcome variable because it’s so difficult to interpret) and I am tempted to change them for the fifth or sixth time, but I should probably start preparing for the quiz.
In this section I will build a simple linear regression model to predict my outcome, years of potential life lost ( YPLL
) using a quantitative predictor, income inequality (income
).
Description of variables:
Years of potential life lost (YPLL
), also known as ‘premature death’ describes the years of potential life lost under age 75 per 1000 people. The units are losses per 1000 population
Income inequality (income
) describes the ratio of household income at the 80th percentile to income at the 20th percentile. This value is unitless.
Tibble
The tibble below, titled “Cuyahoga Income Inequality and YPLL”, specifies the values of the predictor, income
, and the outcome, YPLL
, for Cuyahoga County, in Ohio, where CWRU’s campus is located
income inequality: 5.6
years of potential life lost: 90.9 losses per 1000 population
income_inequality_and_YPLL <-
chr_2020 %>%
filter(county == "Cuyahoga County") %>%
select(state, county, income, YPLL )
income_inequality_and_YPLL
# A tibble: 1 x 4
state county income YPLL
<fct> <chr> <dbl> <dbl>
1 OH Cuyahoga County 5.63 90.9
Complete Data
Although the tibble above only displays the values for Cuyahoga County, the linear regression will include data from all of the counties with complete data from the 5 states I am studying: CA, MN, NY, WA, OH.
The tables below indicate that all 334 counties have complete data on both variables.
chr_2020 %>%
filter (income > 9.1) %>%
select (state, county, income, YPLL)
# A tibble: 1 x 4
state county income YPLL
<fct> <chr> <dbl> <dbl>
1 NY New York County 9.21 39.3
mosaic::favstats(~income, data=chr_2020) %>%
kable(digits = 1)
min | Q1 | median | Q3 | max | mean | sd | n | missing | |
---|---|---|---|---|---|---|---|---|---|
3 | 4 | 4.3 | 4.7 | 9.2 | 4.4 | 0.7 | 334 | 0 |
mosaic::favstats(~YPLL, data=chr_2020) %>%
kable(digits = 1)
min | Q1 | median | Q3 | max | mean | sd | n | missing | |
---|---|---|---|---|---|---|---|---|---|
32.1 | 54.9 | 68 | 81.4 | 327.9 | 70.5 | 24.7 | 334 | 0 |
Is higher income inequality associated with higher rates of premature mortality in the 334 counties of Ohio, Minnesota, California, New York and Washington?
The scatterplot below shows the negligible positive association between the predictor, income inequality (income
) and years of potential life lost (YPLL
), also known as ‘premature death’.
The weakness in the relationship is indicated by a clustering of points between an income inequality of 3.5 to 5, with a few extreme outliers. The resulting essentially horizontal straight fit line indicates no relationship between the two variables.
ggplot(chr_2020, aes(x = income, y = YPLL)) +
geom_point(alpha=0.3) +
geom_jitter(pch = 1) +
geom_smooth(method = "lm", se = FALSE, formula = y ~ x, col = "red") +
geom_smooth(method = "loess", se = TRUE, formula = y ~ x, col = "blue", span = 0.5) +
labs(title = "Higher income inequality is not associated with higher premature death") +
labs(subtitle = "in top ranked states for premature death (CA, MN, NY, WA) plus OH (counties=334)", x= "income inequality (ratio of household income at 80th percentile to income at 20th percentile)", y="Years of potential life lost (losses per 1000 persons)") +
annotate("text", x = 7, y = 200, col = "purple", size = 6,
label = paste("Pearson r = ", signif(cor(chr_2020$income, chr_2020$YPLL),2))) +
annotate("text", x = 7, y = 0, col = "red", size = 5,
label = paste("y = ", signif(coef(lm(chr_2020$YPLL ~ chr_2020$income))[1],3), " + ",
signif(coef(lm(chr_2020$YPLL ~ chr_2020$income))[2],2), "x"))
The transformations I applied to the linear regression model were a log transformation, a square transformation, and an inverse transformation. I applied the transformations to the outcome alone and then also tried transforming both the predictor and the outcome. I evaluated the transformed data on scatterplots and then compared them to the untransformed scatterplot.
The loess lines were further away from the linear fit line for the all of transforms applied, as they were more curved than the loess lines for the untransformed data.
The one plot which was the most promising of the transformations was the log transformation, which is shown below. This was the most promising transformation according to box-cox, which suggested a transformation close to 0 (log) and by visualization which showed the least curvature from the straight fit line. Nevertheless, as you can see below, the loess line still curves away more from the straight fit line than the straight fit line.
car::boxCox(chr_2020$YPLL ~ chr_2020$income)
car::powerTransform(chr_2020$YPLL ~ chr_2020$income)
Estimated transformation parameter
Y1
-0.3413662
pl1<- ggplot(chr_2020, aes(x = income, y = YPLL)) +
geom_point(alpha=0.3) +
geom_jitter(pch = 1) +
geom_smooth(method = "lm", se = FALSE, formula = y ~ x, col = "red") +
geom_smooth(method = "loess", se = FALSE, formula = y ~ x, col = "blue") +
labs(title = "Untransformed outcome (YPLL)") +
labs( x= "income inequality", y="YPLL")
pl2<-ggplot(chr_2020, aes(x = income, y = log(YPLL))) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, formula = y ~ x, col = "blue") +
geom_smooth(method = "lm", col = "red", formula = y ~ x) +
labs(title = "Log transformed (YPLL)", x= "income inequality", y="log YPLL")
pl1/pl2
Provided that I chose not to transform for the first analysis, none of the other analyses were transformed for consistency of interpretation.
I fit a model, “mod_1”, to use income
to predict YPLL
using the code below. The tidy version displays the estimates of the slope and intercept as well as the 90% CI for those estimates.
The straight line model for these data fitted by least squares is YPLL
= 64.7 + 1.3income
The intercept (64.7) is the predicted YPLL
for a county with an income inequality of 0. Therefore, we expect that if there is no income inequality in a county, then the years of potential life lost will be 64.7 losses per 1000 population.
The slope of income
is positive, which indicates that as income inequality increases, we expect that YPLL
will also increase. Specifically, we expect that for every additional unit of income inequality, the years of potential life lost will increase by an average of 1.3 losses per 1000 population.
From the confidence interval output, we are 90% confident that the true slope for income
is between (-2.0, 4.7). The confidence interval for the slope includes zero, which means if the slope was in fact zero, then this would mean that knowing income inequality information would be of no additional value in predicting years of potential life lost over just guessing the mean of YPLL
for every county.
We are also 90% confident that the true intercept is between (49.8, 79.6). This is statistically significant.
mod_1 <- lm(YPLL ~ income, data = chr_2020)
tidy(mod_1,
conf.int = TRUE, conf.level = 0.90) %>%
kable(digits = 1)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 64.7 | 9 | 7.2 | 0.0 | 49.8 | 79.6 |
income | 1.3 | 2 | 0.7 | 0.5 | -2.0 | 4.7 |
To assess how well the relationship between income
and YPLL
can be described using a linear function, a Pearson correlation coefficient was calculated for the 334 observations. The correlation was close to zero (0.036). An r=0 describes a situation where there is no linear situation, therefore this is extremely close to there not being a linear association.
chr_2020%$% cor(YPLL, income) %>% kable(digits=4)
x |
---|
0.036 |
This linear model for the 334 counties has an R-squared = 0.13%. This indicates that 0.13% of the variation in YPLL
is explained by using this linear model with income
Sigma, the residual standard error is 25, which is extremely large.
glance(mod_1, data = chr_2020) %>% select(r.squared, sigma, nobs) %>% kable(digits = c(4, 0, 0))
r.squared | sigma | nobs |
---|---|---|
0.0013 | 25 | 334 |
I prepared a pair of residual plots below to assess:
residuals vs. fitted values for non-linearity
Normality in the residuals
I added the county name for the 2 largest residuals.
Interpretation of Plots
residuals vs. fitted: the significant features of the plot are (1) the pronounced curve from the loess line. On average, the predicted YPLL
was too low. (2) absence of the idea fuzzy football shape due to the clustering of points (3) two extreme outliers: Alpine County and Mahnomen County. This does not support the assumption of linearity.
Normal Q-Q plot: slight right skew with the two extreme outliers mentioned above. This does not support the assumption of Normality.
Comparing Predicted YPLL to observed YPLL for Cuyahoga County
Predicted: 72 losses per 1000 people
Observed: 91 losses per 1000 people
mod_1_aug %>%
filter(county == "Cuyahoga County") %>%
select(state, county, YPLL, .fitted, .resid,
income) %>% kable(digits = c(0,0,0,0,0,1))
state | county | YPLL | .fitted | .resid | income |
---|---|---|---|---|---|
OH | Cuyahoga County | 91 | 72 | 19 | 5.6 |
Two Counties with the largest residuals
The two counties with the highest residuals in absolute value were Alpine County and Mahnomen County, which are located in CA and MN, respectively.
mod_1_aug %>%
filter(county %in% c("Alpine County","Mahnomen County")) %>%
select(state, county, income, YPLL, .fitted, .resid,
income) %>% kable(digits = c(0,0,0,0,0,1))
state | county | income | YPLL | .fitted | .resid |
---|---|---|---|---|---|
CA | Alpine County | 4 | 328 | 70 | 257.7 |
MN | Mahnomen County | 5 | 180 | 71 | 109.8 |
My research question was, “Is higher income inequality associated with higher rates of premature mortality in the 334 counties of Ohio, Minnesota, California, New York and Washington?” In summary, the results indicate that no, years of potential life lost is not higher in states with higher income inequality. Although the linear regression model predicted that YPLL
would be 1.03 losses per 1000 people higher for each 1 unit increase in the proportion of people at the 80th income percentile to the 20th income percentile, this is such a minute value that it is not meaningful, and nor is it actually statistically significant according the 90% confidence interval (-2, 4.7). Furthermore, the residual standard error was extremely high (sigma=25). On average, the predictions were considerably lower than the observed values, with the most extreme prediction being 103 losses/ 1000 people too low. Resultingly, the residuals were far from normal and were not linear. The large prediction errors are explained by the lack of a linear relationship between the two variables (r=0.036), where only 0.13% of the variation in YPLL
was explained by income
. The lack of a relationship is displayed beautifully on the scatterplot where we can see an essentially horizontal straight fit line, clustering of points, and a loess line that curves away from the straight fit line. The lack of a linear relationship is consistent with what was described on the County Health Rankings website because they specifically stated that income was not one of the variables associated with premature death (only age, gender, race, and education). Although I had seen this while picking variables, I thought that the relationship still could potentially exist if I used states with similar premature death rankings (besides OH) because their analysis included counties across all 50 states. I had presumed a relationship might be possible because I had read a study regarding factors that are associated with premature death and income was one of them. However, that study was based in India, which is a completely different population and is clearly not generalizable to the United States. Provided that there is in fact no linear relationship between income and premature death in the United States (at least without further adjustment for confounders), a transformation could not make a relationship magically appear. In fact, all transformations applied (even when they weren’t one of Dr. Love’s 3 recommended) did not improve the linear relationship.
Although I was unable to demonstrate a linear relationship between income inequality and premature death, there could have been some problems with the data collection methods and analysis methods. First, there was no temporality because data collection for premature death began in 2016 (3 year average from 2016-2018) and data collection for income inequality began in 2017. Second, the sampling methods for income inequality may be inadequate. Perhaps there was a vulnerable population that was not captured in the county’s survey on income inequality due to non-response. Third, it is still hard for me to understand how income inequality is not somehow associated with premature death provided that income contributes to access to care, health behaviors, neighborhood violence, education, etc, all of which can contribute to dying earlier from preventable causes. Therefore, I think there must be negative confounders somewhere that we don’t know about and they need to be adjusted for. Fourth, there were some extreme outliers. I investigated outliers on the County Health Rankings website both in terms on income inequalities that may not be plausible and YPLLs that were questionable. For example, a typical income inequality was 4.3, but we saw New York County had an income inequality more than double that at 9.2. Based on living in NYC and evaluating other related measures like Children in Poverty, this value didn’t seem impossible (although they didn’t have values for previous years to truly compare). I also further investigated the potential validity of the two major residual outliers, Alpine County and Mahnomen County, and relative to the previous years their observed values seemed rational because they were also disproportionately high and growing over time. Therefore if we decided to do an analysis where we didn’t include impossible values, there isn’t enough evidence to throw out our outliers in order to improve the model. In summary, I thought Spiegelhaters’ citation of George Box’s quote, “All models are wrong, some are useful” was extremely fitting for this apparently terrible model.
Lastly, this model is clearly not representative of all counties across the United States, especially because the characteristics of the states I chose. When I was originally choosing states, I thought I was choosing states with the highest premature mortality (i.e. states ranked 1-5 would have the highest YPLL in comparison to the rest of the United States). After careful analysis, I noticed that the range of YPLL
was actually higher for OH (ranked #42 ) than the other states, which was slightly confusing to me. I then went back on the America’s Health Ranking website (https://www.americashealthrankings.org/explore/annual/measure/YPLL/state/ALL) and realized that by choosing the top ranked states, I was actually choosing states with the lowest premature death. If I were to choose states with the highest premature death they would be West Virginia, Mississippi, Alabama, and Kentucky, which are ranked 50, 49, 48, and 47, respectively.Perhaps, the analysis would have shown more of a relationship with these states.
Next I will be building a linear regression model to predict my outcome variable, years of potential life lost, YPLL
from the binary variable % not English proficient, not_english_prof_10pct
.
There was complete data for these variables across the 334 counties in the 5 states included: CA, MN, NY, WA, and OH, which can be seen from the two tables below.
mosaic::favstats(not_english_prof ~ not_english_prof_10pct, data = chr_2020) %>%
kable(digits = 1)
not_english_prof_10pct | min | Q1 | median | Q3 | max | mean | sd | n | missing |
---|---|---|---|---|---|---|---|---|---|
above10percent | 10.2 | 11.2 | 13.2 | 15.0 | 20.7 | 13.6 | 2.9 | 21 | 0 |
below10percent | 0.0 | 0.4 | 0.7 | 1.9 | 10.0 | 1.6 | 2.2 | 313 | 0 |
mosaic::favstats(~ YPLL, data=chr_2020) %>%
kable(digits = 1)
min | Q1 | median | Q3 | max | mean | sd | n | missing | |
---|---|---|---|---|---|---|---|---|---|
32.1 | 54.9 | 68 | 81.4 | 327.9 | 70.5 | 24.7 | 334 | 0 |
Years of potential life lost (YPLL
) is a continuous variable, as indicated in the tibble below by the denotation ‘dbl’. Years of potential life lost describes premature death. The numerator is the number of years of potential life lost for deaths that occurred among people who reside in a county under the age 75 for 3 years. The denominator is the sum of the population younger than 75 years old. The value is then multiplied by 1000 to calculate the years of potential life lost under age 75 per 1000 people. Thus, the unit of measurement is losses per 1000 population.
Percent not English proficient (not_english_prof_10pct
) indicates the percent of a county that is not proficient in English. This is a categorical variable, as indicated in the tibble below by the denotion ‘fctr’ and the two levels: ‘above10percent’ and ‘below10percent’. These groups represent if ≥10% of the county has residents not proficient in English and <10% of the county not proficient in English, respectively.
The tibble below, ‘not_English_and_YPLL’ displays the binary predictor variable, not not_english_prof_10pct
and the continuous outcome variable, YPLL
, and their characteristics, as described above. Although there is complete data for all 334 counties in CA, MN, NY, WA, and OH, I am only showing the observations for Cuyahoga County, where the best university ever is located.
From the tibble we can see that in Cuyahoga county, the percent of people who are not proficient in English is less than 10% and the years of potential life lost is about 91 losses per 1000 people.
not_English_and_YPLL <-
chr_2020 %>%
filter(county == "Cuyahoga County") %>%
select(state, county, not_english_prof_10pct, YPLL )
not_English_and_YPLL
# A tibble: 1 x 4
state county not_english_prof_10pct YPLL
<fct> <chr> <fct> <dbl>
1 OH Cuyahoga County below10percent 90.9
Do counties with less than 10% of the population not proficient in English show lower levels of years of potential life lost in the 334 counties of OH, CA, MN, WA, NY?
I have provided a boxplot (with a violin plot) showing the outcome, YPLL
broken down by levels of the predictor,not_english_prof_10pct
: ‘above10percent’ and ‘below10percent’.
d1<-chr_2020 %>%
mutate(not_english_prof_10pct=fct_recode(not_english_prof_10pct,"≥10%"="above10percent", "<10%"="below10percent"))
d1 %>%
ggplot(aes(x=not_english_prof_10pct, y=YPLL))+
guides(fill=FALSE) +
geom_violin(aes(fill=not_english_prof_10pct))+
scale_fill_viridis_d(end = 0.6)+
geom_boxplot(width=0.3)+
coord_flip()+
labs(title="Years of Potential Life lost broken down by % not English proficient",subtitle = "334 counties in OH, CA, WA, NY, MN", y ="Age-Adjusted YPLL per 1000", x= "% not English proficient in the county") +
geom_text(aes(x = 1.2, y=200), size = 4, label = paste0("median, [IQR] = 62, [18]")) +
geom_text(aes(x = 0.9, y=200), size = 4, label = paste0("range = 42 - 79"))+
geom_text(aes(x = 2.2, y=200), size = 4, label = paste0("median, [IQR] = 69, [27]" )) +
geom_text(aes(x = 1.9, y=200), size = 4, label = paste0("range = 32 - 327 "))
mosaic::favstats(YPLL~ not_english_prof_10pct, data = chr_2020) %>% kable(digits = 1)
not_english_prof_10pct | min | Q1 | median | Q3 | max | mean | sd | n | missing |
---|---|---|---|---|---|---|---|---|---|
above10percent | 42.0 | 50.2 | 62.1 | 68.2 | 79.2 | 59.7 | 11.8 | 21 | 0 |
below10percent | 32.1 | 55.4 | 68.6 | 82.3 | 327.9 | 71.3 | 25.2 | 313 | 0 |
Significant features of the plots:
The ≥10% not English proficient group has a bimodal distribution and the data is scrunched up (lack of spread)
The <10% not English proficient group has some extreme outliers on the high end and lack of not as many values as we would expect on the lower end
The mean and median are not the same: the bold line is not in the center of either of the boxes
YPLL
were smaller in the less English proficient group, with the maximum being 81 losses per 1000 people. This will be further investigated and addressed in the conclusion.The calculated predicted equation for the log of years of potential life lost (YPLL) is: YPLL
= 59.7 + [(11.6)(below10percent)]
intercept = 59.7 : if a county has >10% of its population not proficient in English, the years of potential life lost will be 59.7 losses per 1000 people.
slope = 11.6 : the slope of YPLL is 11.6 with a 90% confidence interval that its between 2.4 and 20.7. This model predicts that a county that has <10% of its residents not proficient in English will have a years of potential life lost that is 11.6 losses per 10000 people higher than a county with ≥10% of its residents not proficient in English.
Note: As described above, the baseline category for the predictor not_english_prof_10pct
is “above10percent”. As we can see by the output below, above10percent is the one that is left out.
mod_2 <- lm(YPLL ~ not_english_prof_10pct, data = chr_2020)
tidy(mod_2, conf.int = TRUE, conf.level = 0.90) %>% select(term, estimate, std.error, conf.low, conf.high) %>% kable(digits = 1)
term | estimate | std.error | conf.low | conf.high |
---|---|---|---|---|
(Intercept) | 59.7 | 5.4 | 50.9 | 68.5 |
not_english_prof_10pctbelow10percent | 11.6 | 5.5 | 2.4 | 20.7 |
The r squared is 0.013, which implies that 1.3% of the variation in YPLL
is explained by using this linear model with not_english_prof_10pct
Sigma, the residual standard error is 25, which is extremely large.
glance(mod_2, data = chr_2020) %>% select(r.squared, sigma, nobs) %>% kable(digits = c(3, 0, 0))
r.squared | sigma | nobs |
---|---|---|
0.013 | 25 | 334 |
I plotted the residuals against the categorical predictor, not_english_prof_10pct
, in a useful way to help assess normality of the residuals within both categories: non-English proficiency above 10% and non-English proficiency below 10%.
Residuals vs fitted: This divides the counties into two groups and assigns the mean for each group. The result is a bunch of counties with a prediction of ~60 losses/ 1000 people (<10% non-English proficient) and ~71 losses/1000 people (≥10% non-English proficient) as indicated by the two vertical lines. Provided that the predictor is a binary variable, we do not see the fuzzy football, like we could potentially see with a continuous predictor. We can see the two major outliers, Alpine County and Mahnomen County. We learn much more from the QQ plot and the box plot.
QQ plot: We can see a little bit of right skew here with the two major outliers previously identified on the residual vs. fitted plot, Alpine County and Mahnomen County. This is not normal.
Box plot with violin: we can see the outliers are causing a substantial right skew. The data is not normally distributed.
Below I have displayed the model’s prediction for the outcome, YPLL
for Cuyahoga County, in Ohio, and I have compared it to Cuyahoga’s actual value of YPLL
.
Predicted: 71 losses per 1000 people
Observed: 91 losses per 1000 people
Residual: 20 losses per 1000 people. Therefore, the difference between the predicted (fitted) and observed was 20.
aug2 %>%
filter(county == "Cuyahoga County") %>%
select(state, county, YPLL, .fitted, .resid,
not_english_prof_10pct) %>% kable(digits = c(0,0,0,0,0,1))
state | county | YPLL | .fitted | .resid | not_english_prof_10pct |
---|---|---|---|---|---|
OH | Cuyahoga County | 91 | 71 | 20 | below10percent |
I have identified the two counties (by name and state) where model 2 is least successful at predicting the outcome, YPLL
(in the sense of having the largest residual in absolute value)
Alpine County, CA: largest residual (off by 257 losses/1000 people)
Mahnomen County, MN: second largest residual (off by 109 losses/1000 people)
aug2 %>%
filter(county %in% c("Alpine County","Mahnomen County")) %>%
select(county, state, not_english_prof_10pct, YPLL, .fitted, .resid) %>% kable (digits=0)
county | state | not_english_prof_10pct | YPLL | .fitted | .resid |
---|---|---|---|---|---|
Alpine County | CA | below10percent | 328 | 71 | 257 |
Mahnomen County | MN | below10percent | 180 | 71 | 109 |
In conclusion, my research question, “Do counties with less than 10% of the population not proficient in English show lower levels of years of potential life lost in the 334 counties of OH, CA, MN, WA, NY?” This was not supported by the results. Contrary to my hypothesis, YPLL
was higher (statistically, but not meaningfully) in counties with non-English proficiency <10%. The sample mean YPLL for the counties with non-English proficiency ≥ 10% was 59.7 losses/1000 people. The sample mean YPLL for the counties with non-English proficiency < 10% was 71.3 losses/1000 people. The point estimate for the mean difference in non-English proficiency < 10% and non-English proficiency ≥10% is therefore 11.6 losses/1000 people. Although the 90% CI for the difference was significant, it was incredibly wide and shows how meaningless this model could be in predicting YPLL
from not_english_prof_10pct
(90% CI=2.4, 20.7). If the true difference was in fact only 2.4 losses/1000 people, this is not a meaningful value at all and would not inspire policy makers to focus on improving English proficiency in order to decrease YPLL (also this would be unlikely anyway due to limitations in the study design). Furthermore, only 1.3% of the variation in YPLL
was explained by the binary variable, not_english_prof_10pct
, so this really isn’t a very useful predictor. The disutility of this predictor can be further supported by the huge errors of the fitted values. The sigma was 25 and our largest residual error was 257 losses/1000 people.
There were various limitations to this model, as we can see by our extremely wide confidence interval, large high residual standard error, and very small r-squared . First, there was an incredibly small sample size in the group not English proficient ≥10% (21 vs 313). I believe that there could be more counties that fit into the non-English proficient >10% group, especially in New York and California where there are more immigrants. I have read about the validity of the instrument used, so I do not think that was the problem, but rather, that vulnerable people within the county were not represented in the survey due to selection bias. Second, there were two major outliers in the <10% not English proficient group. Third, the instructions stated, “your model will assume that the distribution of the outcome is Normal, with similar variance across each level of your categorical predictor.” In reality, the distribution of my outcome was not normal, so the method of estimation (a linear model is a t-test) was really not practical and instead should have been a bootstrap. Fourth, there is no adjustment as we are just doing a univariate analysis and there could be many confounders. Spiegelhalter discusses Simpson’s paradox, “which occurs when the apparent direction of an association is reversed by adjusting for a confounder.” (Speigelhalter, 112). I believe that in addition to the small sample size, there may be problems with the model due to confounders which actually made the relationship appear that there is higher premature mortality in counties with higher English proficiency, when perhaps it is flipped (that is if there actually is a true relationship). Fifth, I believe the model could have slightly improved if I used more categories of English proficiency rather than limiting it to just two, but regardless, it is clear that it is not helpful to build categories for intrinsically quantitative data. Sixth, for consistency of interpretation across models, a transformation was not applied. According to Leek, log transformations should be applied to ratio measurements because it will make distribution more symmetric. When a log transformation was applied, the box plots looked considerably better (and this was the best transformation of the 3). However, the log transformation worsened the linear relationship for the other two models, and therefore to be consistent, the log transformation was not used here. Lastly, these results are not generalizable to all of the United States, especially because I used 4 states with similar premature death rates, plus one state, OH, with a vastly different premature mortality rate.
In this section I will build a linear regression model to predict my outcome, years of potential life lost ( YPLL
) using a quantitative predictor, percentage of the population that graduated high school (hs_grad
), and a categorical variable state (state
).
Description of variables:
Years of potential life lost (YPLL
), also known as ‘premature death’ describes the years of potential life lost under age 75 per 1000 people. The units are losses per 1000 population
High school graduation (hs_grad
), a quantitative predictor, describes the percentage of the county’s ninth grade cohort that graduates with a high school diploma in four years. The unit is %.
state
includes the states with the ranks 1,2,3, and 4 for premature mortality rate, which are CA, MN, NY, WA, respectively, plus OH
Tibble
The tibble below, titled “Hs_grad_state_and_YPLL”, specifies the values of the predictor, hs_grad
, the state, state
, and the outcome, YPLL
, for Cuyahoga County, where CWRU’s campus is located
high school graduation: 86.4%
state: OH
years of potential life lost: 90.9 losses per 1000 population
Hs_grad_state_and_YPLL <-
chr_2020 %>%
filter(county == "Cuyahoga County") %>%
select(county, state, hs_grad, YPLL )
Hs_grad_state_and_YPLL
# A tibble: 1 x 4
county state hs_grad YPLL
<chr> <fct> <dbl> <dbl>
1 Cuyahoga County OH 86.4 90.9
Complete Data
Although the tibble above only displays the values for Cuyahoga County, the linear regression will include data from all of the counties with complete data from the 5 states I am studying: CA, MN, NY, WA, OH.
The tables below indicate that there is one missing value for the variable hs_grad
and no missing values for YPLL
among the 334 counties.
mosaic::favstats(~hs_grad, data=chr_2020) %>%
kable(digits = 1)
min | Q1 | median | Q3 | max | mean | sd | n | missing | |
---|---|---|---|---|---|---|---|---|---|
36.3 | 83.2 | 87.5 | 91.7 | 100 | 86.6 | 7.6 | 333 | 1 |
mosaic::favstats(~income, data=chr_2020) %>%
kable(digits = 1)
min | Q1 | median | Q3 | max | mean | sd | n | missing | |
---|---|---|---|---|---|---|---|---|---|
3 | 4 | 4.3 | 4.7 | 9.2 | 4.4 | 0.7 | 334 | 0 |
I created a new data set, chr_2020_hs_cc, which only includes the complete cases for hs_grad
chr_2020_hs_cc <- chr_2020 %>%
filter(complete.cases(hs_grad))
Sanity check #1:
The tibble below describes the complete case dataset, chr_2020_hs_cc, that I will use for this analysis. There are 333 counties included in this dataset because the one county with missing data has been dropped.
chr_2020_hs_cc
# A tibble: 333 x 10
state county fipscode YPLL income std hs_grad not_english_prof
<fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CA Alame… 06001 44.1 5.21 553. 86.8 8.44
2 CA Amado… 06005 63.6 4.25 171. 88.5 1.10
3 CA Butte… 06007 77.8 5.57 527. 84.5 2.50
4 CA Calav… 06009 73.5 4.64 166. 89.7 0.725
5 CA Colus… 06011 65.6 4.22 225. 89.3 16.0
6 CA Contr… 06013 46.2 4.74 500. 87.9 6.48
7 CA Del N… 06015 89.5 5.07 364 81.0 1.68
8 CA El Do… 06017 51.6 4.87 238. 89.4 1.73
9 CA Fresn… 06019 70.1 5.13 728. 81.5 12.0
10 CA Glenn… 06021 73.6 5.26 395. 81.7 9.98
# … with 323 more rows, and 2 more variables: not_english_prof_10pct <fct>,
# std_cut3 <fct>
How well does the percentage of a county that graduates high school predict a county’s years of potential life lost after accounting for differences between states in the counties with complete data in Ohio, Minnesota, California, New York and Washington?
I have provided a scatterplot of my outcome, YPLL
and my quantitative predictor, hs_grad
and faceted by state
.
ggplot(chr_2020_hs_cc, aes(x = hs_grad, y = YPLL, col = state)) +
geom_point(alpha=0.1) +
geom_jitter(pch = 1) +
facet_wrap(~ state) +
geom_smooth(method = "lm", se = FALSE, formula = y ~ x, col = "black") +
geom_smooth(method = "loess", se = FALSE, formula = y ~ x, col = "red") +
guides(color = FALSE) +
theme(strip.text = element_text(face="bold", size=rel(1.25), color="white"),
strip.background = element_rect(fill="royalblue")) +
labs(title = "Association of % of high school graduates with years of potential life lost (YPLL)", subtitle= "Stratified by state (N=333 counties)",
x= "Percent of high school graduates per county (%)", y = "YPLL (losses/1000 people)")
The pattern appears to be:
Indirect, or negative, in that as the values of hs_grad
increases, the YPLL
decreases. This is true for all states according to their best fit lines.
Not linear: The smooth line curves away from the line of best fit for each state. The curvature is the most pronounced in CA and MN where there are many outliers on the low end.
Weak: the range of values for YPLL
associated with any particular value of hs_grad
is not tight. If we know a county’s percentage of the population that graduated high school, that should not meaningfully improve our ability to predict the years of potential life lost for that county, among counties in these data.
Unusual outliers: there are unusual outliers, especially in MN and CA. There appears to be fewer outliers in CA and NY. I am not sure if it is plausible that there is a county in CA where <40% of the county has graduated high school and furthermore that its years of potential life lost could be <50 losses/1000 people.
I fit a model, “mod_3”, to use hs_grad
and state
to predict YPLL
using the code below. The tidy version displays the estimates of the slope and intercept as well as the 90% CI for those estimates.
The straight line model for these data fitted by least squares is YPLL
=141.3 - 0.6hs_grad
- 27.2CA - 28.3MN -26.4NY -27.2WA
The intercept (141.3) is the predicted YPLL
for a county with a high school graduation of 0% and not in CA, WA, MN, or NY. Therefore, we expect that if nobody graduates high school in a county and the county is in not one of the states listed, then the years of potential life lost will be 141.3 losses per 1000 population.
The slope of hs_grad
is negative, which indicates that as high school graduation increases, we expect that YPLL
will decrease. Specifically, if we hold the state a county is in constant, every increase in a % of high school graduation, the years of potential life lost will decrease by 0.6 losses per 1000 population.
We are 90% confident that the true slope for hs_grad
is between (-0.8, -0.4). This is a fairly narrow confidence interval and it is significant (doesn’t cross zero)
Ohio is the indicator variable. Therefore, for a given county in OH, this model predicts that the average years of potential life lost for that county will be: YPLL
= 141.3 - 0.6hs_grad
The coefficient for the levels of the state are -27.2 for CA, -28.3 for MN, -26.4 for NY, and -27.2 for WA. This means that, relative to OH, California has a 27.2 fewer YPLL than OH on average, given equivalent hs_grad
. Similarly, relative to Ohio, given equivalent hs_grad
, MN, NY, and WA have on average - 28.3, -26.4, and -27.2 fewer YPLL.
The confidence intervals around each of the estimates for CA, MN, NY, and WA are significant. We are 90% confident that the true coefficients for each of these states are:
CA: -32.6, -21.9
MN: -32.9, -23.8
NY: -31.5, -21.4
WA: -33.1, -21.3
mod_3 <- lm(YPLL ~ hs_grad + state, data = chr_2020_hs_cc)
tidy(mod_3,
conf.int = TRUE, conf.level = 0.90) %>%
kable(digits = 1)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | 141.3 | 13.4 | 10.6 | 0 | 119.2 | 163.4 |
hs_grad | -0.6 | 0.1 | -4.1 | 0 | -0.8 | -0.4 |
stateCA | -27.2 | 3.3 | -8.4 | 0 | -32.6 | -21.9 |
stateMN | -28.3 | 2.7 | -10.3 | 0 | -32.9 | -23.8 |
stateNY | -26.4 | 3.1 | -8.7 | 0 | -31.5 | -21.4 |
stateWA | -27.2 | 3.6 | -7.6 | 0 | -33.1 | -21.3 |
The linear model including the interaction of state to predict YPLL
for the 333 counties with a known percentage of high school graduates has R-squared = 29%.
Sigma, the residual standard error is 17, which is fairly large.
glance(mod_3, data = chr_2020_hs_cc) %>% select(r.squared, sigma, nobs) %>% kable(digits = c(2, 0, 0))
r.squared | sigma | nobs |
---|---|---|
0.29 | 17 | 333 |
I prepared a pair of residual plots below to assess:
residuals vs. fitted values for non-linearity
Normality in the residuals
I added the county name for the 2 largest residuals.
Interpretation of Plots
residuals vs. fitted: the significant features of the plot are: (1) the pronounced curves in loess line (2) absence of the ideal fuzzy football shape due to the clustering of points between fitted = 60-70 and 85-90 YPLL. The clustering of the points between a fitted of 85-90 represent most of the predictions errors for OH. (3) two extreme outliers: Mono County and Mahnomen County. This does not support the assumption of linearity
Normal Q-Q plot: heavy right tails & light right tails in the residuals. This does not support the assumption of Normality.
mod_3_aug <- augment(mod_3, data=chr_2020_hs_cc)
p1 <- ggplot(mod_3_aug, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_point(data=mod_3_aug %>%
slice_max(abs(.resid), n=2),
col="red", size=2) +
geom_smooth(method = "lm", formula = y ~ x, se = F,
lty = "dashed", col = "black") +
geom_smooth(method = "loess", formula = y ~ x, se = F,
col = "blue") +
geom_text_repel(data = mod_3_aug %>%
slice_max(abs(.resid), n = 2),
aes(label = county), size=3) +
labs(title = "Model 3 Residuals vs. Fitted YPLL",
x = "Fitted YPLL from Model 3",
y = "Residuals from Model 3")
p2 <- ggplot(mod_3_aug, aes(sample = .resid)) +
geom_qq() +
geom_qq_line(col = "red") +
labs(title = "Model 3 Residuals",
y = "")
p1 + p2 + plot_annotation(title = "Model 3 residuals: predicting YPLL with high school graduation & adjusting for state")
Comparing Predicted YPLL to observed YPLL for Cuyahoga County
Predicted: 90 losses per 1000 people
Observed: 91 losses per 1000 people
mod_3_aug %>%
filter(county == "Cuyahoga County") %>%
select(state, county, hs_grad, YPLL, .fitted, .resid) %>%
kable(digits = c(0,0,0,1,1,1))
state | county | hs_grad | YPLL | .fitted | .resid |
---|---|---|---|---|---|
OH | Cuyahoga County | 86 | 90.9 | 90.1 | 0.8 |
Two Counties with the largest residuals
The two counties with the highest residuals in absolute value were Mono and Mahnomen County, which are located in CA and MN, respectively.
mod_3_aug %>%
filter(county %in% c("Mono County","Mahnomen County")) %>%
select(county, state, hs_grad, YPLL, .fitted, .resid) %>% kable (digits=0)
county | state | hs_grad | YPLL | .fitted | .resid |
---|---|---|---|---|---|
Mono County | CA | 36 | 34 | 93 | -59 |
Mahnomen County | MN | 59 | 180 | 78 | 103 |
We hope to see the median of the residuals also be near zero. In this case, the median prediction is -0.9 losses/1000 people too low.
mosaic::favstats(~ .resid, data = mod_3_aug) %>% select(min,median, max) %>% kable (digits=1)
min | median | max | |
---|---|---|---|
-58.8 | -0.9 | 102.7 |
My research question was, “How well does the percentage of a county that graduates high school predict a county’s years of potential life lost after accounting for differences between states?” There is at least some evidence based on the r-squared (29%) and the significant slope (90% CI -0.8 to -0.4) that the model I built is more effective than a model that ignores hs_grad
and state information, but the prediction was still not impressive. It makes sense that adding this information was useful because according to the County Health Rankings website, education is one of the variables that is associated with premature death. Furthermore, I believe it was helpful that we adjusted for state because one of the limitations of the hs_grad
variable is that the values cannot be compared across states.
The prediction was not that great because the relationship was not perfectly linear. The scatterplots show the loess lines curved away from the best fit line for each state, with the most dramatic curvature seen in CA and MN due to extreme outliers. As a result of the not perfectly linear relationship, there were some considerable errors in predicting YPLL
(sigma=17). The range of prediction errors was -59 (Mono County, CA) to 103 (Mahnomen County, MN). Nevertheless, the median prediction was -0.9 losses/1000 people too low, so it was neat how close the predicted YPLL
was to the observed YPLL
for Cuyahoga county (90 vs 91).
There are several ways I think this model falls short, both in terms of how I analyzed it and the sampling method. First, and most importantly, there is no temporality in this relationship. The high school graduation data was collected from 2017-2018 and the premature death data was collected from 2016-2018 (a 3-year average). Temporality cannot be established because they started collecting premature death data before high school graduation data. If I were to redo this study, I would probably use County Health Ranking data from maybe year 2000 for the high school graduation data. However, a limitation to that approach is that people migrate, and they may not represent the same population that was measured 20 years later. This relates to the second limitation, ecological fallacy. These findings cannot be generalized to the individual level as we are using aggregate data. Third, there were some extreme outliers for both YPLL
and hs_grad
that I am not sure if they were miscalculated, measured incorrectly, or entered incorrectly. For some of the extremes I looked up on the County Health Rankings website to see if the county’s data was consistent with that value for previous years or relation to other measures. The most extreme hs_grad
was Mono County in CA (36.3%) which doesn’t seem plausible (especially relative to CA’s overall 83%). There was no data for prior years on this measure, however, 62% of this county has completed some college (overall CA 65%). It doesn’t seem rational that the percent of college attendance in a county is almost double the percent of the county that graduated high school. Regarding YPLL
there were also some extreme outliers, with Mahnomen County being the worst. As we have seen previously, this county consistently muddles with the analysis. However, YPLL
measured here does seem reasonable based on previous years data which was also high and continuously increasing. In addition to limitations in the data collection, there are also limitations to my analysis method. First, the analysis is not generalizable to all of the United States. I have selected states based on their national ranking of premature death. California, Minnesota, New York, and Washington rank 1, 2, 3, and 5 for premature death rates in the U.S and OH ranks 42. Furthermore, even if we were trying to predict the YPLL
for a state with a similar premature mortality rate (ranked number 4), we couldn’t use the prediction equation because there are only coefficients for the states included in the model. Second, none of the three transformations improved the linear relationship, conversely they worsened it (visually). Third, I applied a complete case analysis and therefore assumed the data was missing at random, which is a gigantic assumption and is almost never the case. Imputation may have slightly improved the model, nevertheless, it was only one missing value so the impact wouldn’t be substantial. Fourth, there was non-normality in the residuals and the residuals vs fitted plot was not a fuzzy football due to outliers (not linear). Fifth, although it is highly frowned upon to remove outliers from the data, careful investigation of the implausibly low high school graduation rate in Mono County CA supports potentially removing it from the data set.
sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4
[5] readr_1.3.1 tidyr_1.1.2 tibble_3.0.3 tidyverse_1.3.0
[9] viridis_0.5.1 viridisLite_0.3.0 ggrepel_0.8.2 ggplot2_3.3.2
[13] patchwork_1.0.1 magrittr_1.5 broom_0.7.0 janitor_2.0.1
[17] rmdformats_0.3.7 knitr_1.29
loaded via a namespace (and not attached):
[1] colorspace_1.4-1 rio_0.5.16 ellipsis_0.3.1
[4] leaflet_2.0.3 ggstance_0.3.4 snakecase_0.11.0
[7] htmlTable_2.0.1 base64enc_0.1-3 ggdendro_0.1.21
[10] fs_1.5.0 rstudioapi_0.11 farver_2.0.3
[13] fansi_0.4.1 lubridate_1.7.9 xml2_1.3.2
[16] mosaic_1.7.0 splines_4.0.2 mnormt_2.0.1
[19] polyclip_1.10-0 Formula_1.2-3 jsonlite_1.7.0
[22] cluster_2.1.0 dbplyr_1.4.4 png_0.1-7
[25] ggforce_0.3.2 compiler_4.0.2 httr_1.4.2
[28] backports_1.1.9 assertthat_0.2.1 Matrix_1.2-18
[31] lazyeval_0.2.2 cli_2.0.2 tweenr_1.0.1
[34] htmltools_0.5.0 tools_4.0.2 gtable_0.3.0
[37] glue_1.4.2 Rcpp_1.0.5 carData_3.0-4
[40] cellranger_1.1.0 vctrs_0.3.4 nlme_3.1-149
[43] crosstalk_1.1.0.1 psych_2.0.7 xfun_0.16
[46] openxlsx_4.1.5 rvest_0.3.6 lifecycle_0.2.0
[49] mosaicCore_0.6.0 MASS_7.3-52 scales_1.1.1
[52] hms_0.5.3 parallel_4.0.2 RColorBrewer_1.1-2
[55] yaml_2.2.1 curl_4.3 mosaicData_0.18.0
[58] gridExtra_2.3 rpart_4.1-15 latticeExtra_0.6-29
[61] stringi_1.4.6 highr_0.8 checkmate_2.0.0
[64] zip_2.1.1 rlang_0.4.7 pkgconfig_2.0.3
[67] evaluate_0.14 lattice_0.20-41 htmlwidgets_1.5.1
[70] labeling_0.3 tidyselect_1.1.0 ggformula_0.9.4
[73] bookdown_0.20 R6_2.4.1 generics_0.0.2
[ reached getOption("max.print") -- omitted 24 entries ]