# read in data and save it as an object called dat
# (can also use read.csv, but I like this function from the tidyverse package better)
<- read_csv("simulated-data.csv") dat
## Rows: 10000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): id, time_ended, time_potentially_exposed, time_exposed
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# look at the first 6 rows
head(dat)
## # A tibble: 6 × 4
## id time_ended time_potentially_exposed time_exposed
## <dbl> <dbl> <dbl> <dbl>
## 1 1 38.3 20.1 20.1
## 2 2 39.4 30.6 30.6
## 3 3 39 44.7 NA
## 4 4 16.1 29.6 NA
## 5 5 38.1 38 38
## 6 6 37.7 27 27
# another way of looking at the data (helpful with lots of variables)
# (this one is from one of the tidyverse packages)
glimpse(dat)
## Rows: 10,000
## Columns: 4
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ time_ended <dbl> 38.28571, 39.42857, 39.00000, 16.14286, 38.14…
## $ time_potentially_exposed <dbl> 20.142857, 30.571429, 44.714286, 29.571429, 3…
## $ time_exposed <dbl> 20.142857, 30.571429, NA, NA, 38.000000, 27.0…
# summary statistics on the data
summary(dat)
## id time_ended time_potentially_exposed time_exposed
## Min. : 1 Min. : 5.143 Min. : 5.143 Min. : 5.143
## 1st Qu.: 2501 1st Qu.:18.714 1st Qu.:15.000 1st Qu.:12.286
## Median : 5000 Median :38.429 Median :25.000 Median :21.000
## Mean : 5000 Mean :31.465 Mean :25.059 Mean :21.399
## 3rd Qu.: 7500 3rd Qu.:40.143 3rd Qu.:35.143 3rd Qu.:30.000
## Max. :10000 Max. :45.000 Max. :45.000 Max. :43.857
## NA's :3399
We can plot the “cumulative incidence” of pregnancies ending using
the stat_ecdf()
function from ggplot. ecdf
stands for “empirical cumulative distribution function” and so will plot
the cumulative proportion of pregnancies that are completed (the
time_ended
variable) by each time point.
ggplot(dat) +
stat_ecdf(aes(time_ended)) +
scale_y_continuous(labels = scales::percent) + # y axis as percentage
coord_cartesian(xlim = c(5, 45), expand = FALSE) + # x axis limits
labs(
x = "Gestational age at end of pregnancy (weeks)",
y = "Cumulative incidence pregnancies completed"
)
We can compute the values from the figure by finding the proportion
(mean()
) of observations with time_ended
less
than a certain value:
summarise(dat, sab = mean(time_ended < 20))
## # A tibble: 1 × 1
## sab
## <dbl>
## 1 0.258
Note: You may know another way to compute these quantities… there are many ways to do the same thing in R, and some of them are easier (less code) than the ones I’m showing you. I’m including code that I have found is easiest to read/most intuitive for new R users. If you want to try out other ways you know, that’s great too!
We may want to look at the distribution of gestational age at delivery among those pregnancies surviving past 20 weeks (for example).
# create a new object with all the pregnancies whose time_ended is past 20
<- filter(dat, time_ended >= 20)
deliveries
ggplot(deliveries) +
stat_ecdf(aes(time_ended)) +
scale_y_continuous(labels = scales::percent) +
coord_cartesian(xlim = c(20, 45), expand = FALSE) + # change limits
labs(
x = "Gestational age at delivery (weeks)",
y = "Cumulative pregnancies completed"
)
And compute the proportion of preterm deliveries:
summarise(deliveries, preterm = mean(time_ended < 37))
## # A tibble: 1 × 1
## preterm
## <dbl>
## 1 0.122
Try adding another summary statistic, very_preterm
, to
compute how many deliveries occur before 32 weeks.
One “naive” approach would just define participants as exposed vs. unexposed if they were infected during pregnancy while still at risk of spontaneous abortion. We’ll create that variable as well as an outcome variable.
<- dat %>%
dat # exposure time is missing (NA) if it didn't happen during pregnancy, so turn that into a 0/1 variable (if it happened while at risk of SAB)
mutate(exposed_while_pregnant = as.numeric(!is.na(time_exposed) & time_exposed < 20),
# we also want the week as a whole number, not a fraction
week_exposed = floor(time_exposed),
sab = as.numeric(time_ended < 20))
We can look at the 2-by-2 table and the naive risk ratio:
sab_2x2(dat)
COVID during pregnancy | Total | ||
---|---|---|---|
Yes | No | ||
Spontaneous abortion | |||
Yes | 332 (11%) | 2,252 (33%) | 2,584 (26%) |
No | 2,760 (89%) | 4,656 (67%) | 7,416 (74%) |
Total | 3,092 (100%) | 6,908 (100%) | 10,000 (100%) |
# "risk ratio" = Pr(Y = 1 | A = 1) / Pr(Y = 1 | A = 0)
mean(dat$sab[dat$exposed_while_pregnant == 1]) / mean(dat$sab[dat$exposed_while_pregnant == 0])
## [1] 0.3293689
This approach is of course hugely biased.
We can allocate person-time and events based on exposure, in which all time after infection is considered exposed:
# collect all the unexposed person-time and events
<- dat %>%
dat_unexposed mutate(
event = case_when(
== 0 & sab == 1 ~ 1,
exposed_while_pregnant TRUE ~ 0
),# everyone starts unexposed at 5
time_in = 5,
# stop time is either end of pregnancy (if < 20), if event happened when unexposed
# or censoring time if exposed before then
time_out = case_when(
== 1 ~ time_ended,
event == 1 ~ time_exposed,
exposed_while_pregnant TRUE ~ 20
),exposure = 0
)
# then the same for exposed
<- dat %>%
dat_exposed filter(exposed_while_pregnant == 1) %>%
mutate(
# since you can't change from exposed to unexposed, we know
# those exposed while pregnant had exposed events
event = sab,
# the exposed start time is when soneone is expoesd
time_in = time_exposed,
# and always ends at the end of pregnancy/20 weeks
time_out = case_when(
== 1 ~ time_ended,
event TRUE ~ 20
),exposure = 1
)
<- bind_rows(dat_unexposed, dat_exposed) %>%
long_dat arrange(id) %>%
select(id, exposure, time_in, time_out, event)
# see what it looks like in a few people
%>%
long_dat filter(id %in% ids_of_interest)
## # A tibble: 6 × 5
## id exposure time_in time_out event
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 712 0 5 12.6 1
## 2 4603 0 5 12.7 0
## 3 4603 1 12.7 20 0
## 4 8527 0 5 12 0
## 5 8527 1 12 20 0
## 6 9493 0 5 15.6 1
WE could just compute a rate ratio from events/person-time within each exposure category, or we could fit a Cox model:
coxph(Surv(time_in, time_out, event) ~ exposure, id = id, data = long_dat) %>%
tbl_regression(exponentiate = TRUE)
Characteristic | HR1 | 95% CI1 | p-value |
---|---|---|---|
exposure | 1.01 | 0.90, 1.14 | 0.8 |
1 HR = Hazard Ratio, CI = Confidence Interval |
This works well in this setting – under the null, with no confounding, but is difficult to interpret and drawbacks to the use to hazard ratios for causal quantities.
We might want to consider week-specific risks instead. We can calculate, for example, risk of spontaneous abortion after exposure at 6 weeks: $(Y = 1 X = 6). Compute the risk after exposure at some other weeks of your choice.
# subset just to the people exposed anytime in week 6
# recall that week_exposed is floor(time_exposed)
<- dat %>%
week_6_exposed filter(week_exposed == 6)
# Pr(Y = 1 | X = 6)
mean(week_6_exposed$sab)
## [1] 0.2155172
However, we can’t directly compare these numbers because each risk is conditional on still being pregnant up to that point.
We can see how different they could be. The risk is where each week-specific incidence curve hits the dashed line at 20 weeks.
%>%
dat filter(week_exposed %in% c(6, 11, 16, 21)) %>%
ggplot(aes(color = factor(week_exposed))) +
geom_vline(xintercept = 20, linetype = "dashed", color = "darkgrey") +
stat_ecdf(aes(time_ended)) +
geom_segment(aes(y = 0, yend = 0, x = 5, xend = 20), color = "grey92") +
coord_cartesian(expand = FALSE) +
labs(y = "Cumulative pregnancies completed", x = "Gestational age", color = "Week exposed")
We’ll prepare the data for a single trial (with a single time zero: week 12) first. We’ll clone the data to that each observation has a row for every treatment strategy (get exposed this week vs. don’t). Then we’ll remove, or censor, the data that doesn’t match that treatment strategy.
Even if we don’t have censoring due to loss to follow-up, we have to
use time-to-event methods that can take our artificial censoring into
account. We put the data into a format in which time_in
is
the time an observation starts contributing data (time zero,
randomization) and time_out
is either the censoring or
event time. The event
variable tells us whether the
observation was censored (0), or the pregnancy ended (1). Recall that
observations are censored because they are exposed when assigned not to
be or fail to get exposed by the end of the grace period when they were
assigned to be. If an event occurs during the grace period, it counts
towards both exposure groups.
# clone every observation twice (exposed + unexposed) for every time zero
<- crossing(dat, time_zero = 12,
expanded_data_12 exposed = c(0, 1)) %>%
# but don't include if not eligible:
filter(!(
<= time_zero) | # pregnancy ended before time zero
(time_ended !is.na(time_exposed) & time_exposed < time_zero) # exposed before the time zero
(
))# look at some ids
filter(expanded_data_12, id %in% ids_of_interest)
## # A tibble: 8 × 9
## id time_ended time_potentially_… time_exposed exposed_while_p… week_exposed
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 712 12.6 22 NA 0 NA
## 2 712 12.6 22 NA 0 NA
## 3 4603 38.9 12.7 12.7 1 12
## 4 4603 38.9 12.7 12.7 1 12
## 5 8527 39.9 12 12 1 12
## 6 8527 39.9 12 12 1 12
## 7 9493 15.6 36.7 NA 0 NA
## 8 9493 15.6 36.7 NA 0 NA
## # … with 3 more variables: sab <dbl>, time_zero <dbl>, exposed <dbl>
<- expanded_data_12 %>%
expanded_data_censored_12 mutate(time_in = 12,
# event or censoring time
time_out = case_when(
# never exposed, but event during grace period (counts for both)
is.na(time_exposed) & time_ended < time_in + 1 ~ time_ended,
# assigned to exposed, but don't do it before grace period ends
== 1 & is.na(time_exposed) ~ time_in + 1,
exposed == 1 & time_exposed >= time_in + 1 ~ time_in + 1,
exposed # assigned to unexposed, but later get exposed
== 0 & !is.na(time_exposed) ~ time_exposed,
exposed # everyone else
TRUE ~ time_ended
),event = case_when(
< time_ended ~ 0,
time_out TRUE ~ 1
)%>%
) # exclude those censored immediately (time in = time out)
filter(time_in < time_out)
%>%
expanded_data_censored_12 filter(id %in% ids_of_interest)
## # A tibble: 7 × 12
## id time_ended time_potentially_… time_exposed exposed_while_p… week_exposed
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 712 12.6 22 NA 0 NA
## 2 712 12.6 22 NA 0 NA
## 3 4603 38.9 12.7 12.7 1 12
## 4 4603 38.9 12.7 12.7 1 12
## 5 8527 39.9 12 12 1 12
## 6 9493 15.6 36.7 NA 0 NA
## 7 9493 15.6 36.7 NA 0 NA
## # … with 6 more variables: sab <dbl>, time_zero <dbl>, exposed <dbl>,
## # time_in <dbl>, time_out <dbl>, event <dbl>
Now we can estimate risks using a Kaplan-Meier estimator, since it can handle censoring.
<- survfit(Surv(time_in, time_out, event) ~ exposed,
km_12 data = expanded_data_censored_12)
# get survival/risk estimates
<- get_km_summary(km_12, times = unique(km_12$time))
km_12_to_plot
# plot as cumulative incidence curve
%>%
km_12_to_plot mutate(exposed = factor(exposed, labels = c("no", "yes"))) %>%
ggplot() +
geom_vline(xintercept = 20, linetype = "dashed", color = "darkgrey") +
geom_path(aes(time, risk, color = exposed)) +
geom_segment(aes(y = 0, yend = 0, x = 5, xend = 21), color = "grey92") +
labs(y = "Cumulative pregnancies completed", x = "Gestational age") +
theme(legend.position = "bottom")
# the risk at 20 weeks we'll define as the risk of SAB
get_km_summary(km_12, times = 20) %>%
select(`COVID at week 12` = exposed, `Risk of SAB` = risk)
## # A tibble: 2 × 2
## `COVID at week 12` `Risk of SAB`
## <dbl> <dbl>
## 1 0 0.0906
## 2 1 0.0805
Now we can apply this to every time zero we wish, as a separate trial. Just like the fact that observations weren’t really assigned exposures, so we don’t know what they were assigned assign them to both, we’ll also have everyone participate in all the trials. Of course, they can only participant in trials before they are exposed or their pregnancy has ended.
# clone every observation twice (exposed + unexposed) for every time zero
<- crossing(dat, time_zero = 5:19,
expanded_data exposed = c(0, 1)) %>%
# but don't include if not eligible:
filter(!(
<= time_zero) | # pregnancy ended before time zero
(time_ended !is.na(time_exposed) & time_exposed < time_zero) # exposed before the time zero
(
))
<- expanded_data %>%
expanded_data_censored mutate(time_in = time_zero,
# event or censoring time
time_out = case_when(
# never exposed, but event during grace period (counts for both)
is.na(time_exposed) & time_ended < time_in + 1 ~ time_ended,
# assigned to exposed, but don't do it before grace period ends
== 1 & is.na(time_exposed) ~ time_in + 1,
exposed == 1 & time_exposed >= time_in + 1 ~ time_in + 1,
exposed # assigned to unexposed, but later get exposed
== 0 & !is.na(time_exposed) ~ time_exposed,
exposed # everyone else
TRUE ~ time_ended
),event = case_when(
< time_ended ~ 0,
time_out TRUE ~ 1
)%>%
) # exclude those censored immediately (time in = time out)
filter(time_in < time_out)
Now we can use this in, e.g., a Kaplan-Meier model:
# we want to fit a survival curve for every time zero, so we tratify both on exposure and time zero
<- survfit(Surv(time_in, time_out, event) ~ exposed + time_zero,
km_mod data = expanded_data_censored)
# get survival/risk estimates at every possible time for each stratum
<- get_km_summary(km_mod, times = unique(km_mod$time), numeric_strata = TRUE)
km_mod_to_plot km_mod_to_plot
## # A tibble: 8,400 × 8
## time n.risk surv conf.lower conf.upper risk exposed time_zero
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5.14 10000 0.994 0.993 0.996 0.00570 0 5
## 2 5.29 9919 0.989 0.987 0.991 0.0112 0 5
## 3 5.43 9828 0.982 0.979 0.984 0.0185 0 5
## 4 5.57 9716 0.975 0.972 0.978 0.0247 0 5
## 5 5.71 9625 0.969 0.966 0.973 0.0307 0 5
## 6 5.86 9525 0.963 0.959 0.966 0.0373 0 5
## 7 6 9420 0.956 0.952 0.960 0.0435 0 5
## 8 6.14 9317 0.950 0.945 0.954 0.0502 0 5
## 9 6.29 9222 0.944 0.940 0.949 0.0557 0 5
## 10 6.43 9144 0.941 0.936 0.945 0.0594 0 5
## # … with 8,390 more rows
And plot the cumulative incidence curves for each stratum. Note that there is a pair of exposed/unexposed curves for each time zero week.
%>%
km_mod_to_plot mutate(group = fct_cross(as.character(time_zero), as.character(exposed)),
exposed = factor(exposed, labels = c("no", "yes"))) %>%
ggplot() +
geom_vline(xintercept = 20, linetype = "dashed", color = "darkgrey") +
geom_path(aes(time, risk, group = group, color = exposed)) +
geom_segment(aes(y = 0, yend = 0, x = 5, xend = 21), color = "grey92") +
labs(y = "Cumulative pregnancies completed", x = "Gestational age")
If we want to estimate the risks of spontaneous abortion, we could look at the cumulative incidence curves at 20 weeks, and then plot them over time. We see the risk decreasing monotonically over pregnancy for the unexposed group, as expected (since it is made up of the same people). The risk for exposure bounces around a little more, since it is estimated from different people every week, but still follows the same pattern. (Recall that these data were generated under the null.)
%>%
km_mod_to_plot filter(time == 20) %>%
# flip the confidence intervals for the risk, not the survival
mutate(conf.lower.tmp = 1 - conf.upper,
conf.upper = 1 - conf.lower,
conf.lower = conf.lower.tmp,
week = time_zero,
exposed = factor(exposed, labels = c("no", "yes"))) %>%
ggplot(aes(exposed, risk, color = exposed)) +
geom_point() +
geom_errorbar(aes(ymin = conf.lower, ymax = conf.upper)) +
facet_grid(switch = "x", cols = vars(week), labeller = label_both) +
labs(y = "Risk of spontaneous abortion", x = NULL) +
theme(legend.position = "bottom", axis.text.x = element_blank())
Instead of estimating a separate survival curve for each exposure/time zero stratum, we might want to fit a model that assumes some structure about the effect of exposure across different weeks of gestation. For example, we might fit a Cox model and assume that the effect of exposure changes linearly (on the log(hazard ratio) scale) across gestation:
# really just need the exposed and exposed:time_zero terms, but it can cause problems in the next step without both main effects
<- coxph(Surv(time_in, time_out, event) ~ exposed*time_zero,
cox_mod data = expanded_data_censored, id = id) #id argument gives robust se's
tbl_regression(cox_mod, exponentiate = TRUE)
Characteristic | HR1 | 95% CI1 | p-value |
---|---|---|---|
exposed | 0.94 | 0.88, 1.00 | 0.058 |
time_zero | 1.00 | 1.00, 1.00 | 0.6 |
exposed * time_zero | 1.01 | 1.00, 1.01 | 0.055 |
1 HR = Hazard Ratio, CI = Confidence Interval |
Of course, under the null, there is neither an effect of exposure, nor does it change over time, so we would expect all the coefficients to be close to 1 on the hazard ratio scale. If we want to report hazard ratios, we could compute them for every time zero week (or several representative weeks):
# this is a really great package for computing combinations of coefficients, interaction terms, etc
# remotes::install_github("vincentarelbundock/marginaleffects")
library(marginaleffects)
comparisons(cox_mod, newdata = crossing(exposed = 0, time_zero = 5:19),
variables = "exposed", type = "lp", transform_post = exp) %>%
select(time_zero, `HR: exposed vs. unexposed` = comparison, conf.low, conf.high)
## time_zero HR: exposed vs. unexposed conf.low conf.high
## 1 5 0.9691787 0.9287760 1.011339
## 2 6 0.9747962 0.9364693 1.014692
## 3 7 0.9804463 0.9435151 1.018823
## 4 8 0.9861291 0.9497997 1.023848
## 5 9 0.9918449 0.9552510 1.029841
## 6 10 0.9975938 0.9598580 1.036813
## 7 11 1.0033761 0.9636715 1.044717
## 8 12 1.0091918 0.9667854 1.053458
## 9 13 1.0150413 0.9693117 1.062928
## 10 14 1.0209246 0.9713600 1.073018
## 11 15 1.0268421 0.9730264 1.083634
## 12 16 1.0327939 0.9743899 1.094699
## 13 17 1.0387801 0.9755131 1.106150
## 14 18 1.0448011 0.9764449 1.117943
## 15 19 1.0508569 0.9772232 1.130039
Although the Cox model does not require estimating the baseline hazard, we can estimate it and combine it with the model coefficients to produce absolute risk estimates. First we have to make a dataframe with all the combinations of coefficients we want to estimate the risk for:
<- crossing(exposed = c(0, 1), time_zero = 5:19) %>%
cox_to_plot mutate(time_in = time_zero, time_out = 20, event = NA)
cox_to_plot
## # A tibble: 30 × 5
## exposed time_zero time_in time_out event
## <dbl> <int> <int> <dbl> <lgl>
## 1 0 5 5 20 NA
## 2 0 6 6 20 NA
## 3 0 7 7 20 NA
## 4 0 8 8 20 NA
## 5 0 9 9 20 NA
## 6 0 10 10 20 NA
## 7 0 11 11 20 NA
## 8 0 12 12 20 NA
## 9 0 13 13 20 NA
## 10 0 14 14 20 NA
## # … with 20 more rows
<- predict(cox_mod, newdata = cox_to_plot,
cox_pred type = "expected", se.fit = TRUE)
# this predict function gives us hazards, which we can transform into survival
$surv <- exp(-cox_pred$fit)
cox_to_plot$conf.lower <- exp(-(cox_pred$fit + qnorm(.025)*cox_pred$se.fit))
cox_to_plot$conf.upper <- exp(-(cox_pred$fit + qnorm(.975)*cox_pred$se.fit))
cox_to_plot
%>%
cox_to_plot mutate(risk = 1 - surv,
conf.lower.tmp = 1 - conf.upper,
conf.upper = 1 - conf.lower,
conf.lower = conf.lower.tmp,
exposed = factor(exposed, labels = c("no", "yes")),
week = time_zero) %>%
ggplot(aes(exposed, risk, color = exposed)) +
geom_point() +
geom_errorbar(aes(ymin = conf.lower, ymax = conf.upper)) +
facet_grid(switch = "x", cols = vars(week), labeller = label_both) +
labs(y = "Risk of spontaneous abortion", x = NULL) +
theme(legend.position = "bottom", axis.text.x = element_blank())
We could also fit a similar model using pooled logistic regression. Instead of a 0/1 outcome for each individual as in a usual logistic regression, we have a 0/1 outcome for each week that each individual was still pregnant, indicating whether or not their pregnancy ended that week.
First we have to again expand the data, now for every week from time zero to the end of pregnancy or week of censoring. (Of course, we could do the same for days, months, etc., depending on the granularity of the data.)
# this will take a few seconds since we're making a huge (long) dataset!
<- expanded_data_censored %>%
expanded_data_censored_weekly # make a row for every possible week
crossing(week = 5:45) %>%
# then only keep those weeks in between time_in and time_out
filter(week >= time_in & week <= ceiling(time_out)) %>%
group_by(id, exposed, time_zero) %>%
# only assign event to the last row (when it actually happened)
mutate(event = case_when(
row_number() == n() ~ event,
TRUE ~ 0
%>%
)) ungroup()
Then we can fit a model. Unlike the Cox model, we have to specify the functional form of the baseline hazard. Here I’m using natural cubic splines with 4 degrees of freedom to model how the baseline hazard changes over time. Other flexible functions are certainly possible.
<- glm(event ~ splines::ns(week, df = 4) + exposed + exposed:time_zero,
glm_mod data = expanded_data_censored_weekly,
family = binomial())
# fitting the confidence intervals with robust standard errors
# however, we are not necessarily going to be interpreting the parameters from this model
# (though they are discrete-time hazard ratios)
tbl_regression(glm_mod, exponentiate = TRUE, include = -contains("splines"),
tidy_fun = partial(tidy_robust, vcov_type = "HC3", ci_method = "wald"))
Characteristic | OR1 | 95% CI1 | p-value |
---|---|---|---|
exposed | 1.00 | 0.91, 1.10 | >0.9 |
exposed * time_zero | 0.98 | 0.98, 0.99 | <0.001 |
1 OR = Odds Ratio, CI = Confidence Interval |
# (there will be a warning -- it's due to the package, not you!)
To get risk estimates, as with the Cox model we can make a dataframe with every combination of variables we want an estimate for. We can then predict the hazard (probability of delivery within the next week, given that someone is still pregnant) at every possible week as a combination of the coefficients, and then use those hazards to estimate the survival and risk at every week.
<- crossing(exposed = c(0, 1), time_zero = 5:19, week = 5:20) %>%
glm_to_plot filter(week >= time_zero)
$hazard <- predict(glm_mod, newdata = glm_to_plot, type = "response")
glm_to_plot
%>%
glm_to_plot group_by(exposed, time_zero) %>%
mutate(int_surv = 1 - hazard,
surv = cumprod(int_surv),
risk = 1 - surv) %>%
ungroup() %>%
mutate(exposed = factor(exposed, labels = c("no", "yes"))) %>%
filter(week == 20) %>%
mutate(week = time_zero) %>%
ggplot(aes(exposed, risk, color = exposed)) +
geom_point() +
coord_cartesian(ylim = c(0, .3)) +
facet_grid(switch = "x", cols = vars(week), labeller = label_both) +
labs(y = "Risk of spontaneous abortion", x = NULL) +
theme(legend.position = "none", axis.text.x = element_blank())
It would be easiest to bootstrap this model to estimate confidence intervals for those estimates.
First we’ll read in some confounded data.
<- read_csv("simulated-data-confounded.csv") dat_confounded
Compare the stratum-specific pattern of pregnancy lengths to the
overall distribution, which is the same as the data above. To change the
plot to see that overall distribution, remove
color = confounder
, which is what is causing the figure to
be stratified.
ggplot(dat_confounded, aes(color = confounder)) +
stat_ecdf(aes(time_ended)) +
scale_y_continuous(labels = scales::percent) +
coord_cartesian(xlim = c(5, 45), expand = FALSE) +
labs(
x = "Gestational age at end of pregnancy (weeks)",
y = "Cumulative incidence pregnancies completed"
)
We can do the same with the cumulative incidence of exposure:
ggplot(dat_confounded, aes(color = confounder)) +
stat_ecdf(aes(time_potentially_exposed)) +
scale_y_continuous(labels = scales::percent) +
coord_cartesian(xlim = c(5, 45), expand = FALSE) +
labs(
x = "Gestational age at exposure",
y = "Cumulative incidence of exposure"
)
We repeat the process of cloning and censoring the data, and estimate stratified survival curves. When we estimate the risks of spontaneous abortion from those curves, the pregnancies that were exposed in the first trimester or so appear to be at higher risk.
# clone every observation twice (exposed + unexposed) for every time zero
<- crossing(dat_confounded, time_zero = 5:19,
expanded_data_confounded exposed = c(0, 1)) %>%
# but don't include if not eligible:
filter(!(
<= time_zero) | # pregnancy ended before time zero
(time_ended !is.na(time_exposed) & time_exposed < time_zero) # exposed before the time zero
(
))
<- expanded_data_confounded %>%
expanded_data_censored_confounded mutate(time_in = time_zero,
# event or censoring time
time_out = case_when(
# never exposed, but event during grace period (counts for both)
is.na(time_exposed) & time_ended < time_in + 1 ~ time_ended,
# assigned to exposed, but don't do it before grace period ends
== 1 & is.na(time_exposed) ~ time_in + 1,
exposed == 1 & time_exposed >= time_in + 1 ~ time_in + 1,
exposed # assigned to unexposed, but later get exposed
== 0 & !is.na(time_exposed) ~ time_exposed,
exposed # everyone else
TRUE ~ time_ended
),event = case_when(
< time_ended ~ 0,
time_out TRUE ~ 1
)%>%
) # exclude those censored immediately (time in = time out)
filter(time_in < time_out)
<- survfit(Surv(time_in, time_out, event) ~ exposed + time_zero,
km_mod_unadjusted data = expanded_data_censored_confounded)
# get survival/risk estimates at every possible time for each stratum
<- get_km_summary(km_mod_unadjusted, times = unique(km_mod_unadjusted$time), numeric_strata = TRUE)
km_mod_unadjusted_to_plot
%>%
km_mod_unadjusted_to_plot filter(time == 20) %>%
# flip the confidence intervals for the risk, not the survival
mutate(conf.lower.tmp = 1 - conf.upper,
conf.upper = 1 - conf.lower,
conf.lower = conf.lower.tmp,
week = time_zero,
exposed = factor(exposed, labels = c("no", "yes"))) %>%
ggplot(aes(exposed, risk, color = exposed)) +
geom_point() +
geom_errorbar(aes(ymin = conf.lower, ymax = conf.upper)) +
facet_grid(switch = "x", cols = vars(week), labeller = label_both) +
labs(y = "Risk of spontaneous abortion", x = NULL) +
theme(legend.position = "bottom", axis.text.x = element_blank())
If we fit a Cox model using the data, we can see that the hazard ratios are above 1 for the first trimester.
<- coxph(Surv(time_in, time_out, event) ~ exposed*time_zero,
cox_mod_unadjusted data = expanded_data_censored_confounded, id = id)
::tidy(cox_mod_unadjusted, exponentiate = TRUE, conf.int = TRUE) broom
## # A tibble: 3 × 8
## term estimate std.error robust.se statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 exposed 1.08 0.0425 0.0312 2.51 0.0120 1.02 1.15
## 2 time_zero 1.00 0.00149 0.000331 0.659 0.510 1.00 1.00
## 3 exposed:tim… 0.997 0.00359 0.00305 -1.11 0.266 0.991 1.00
# this next figure requires the marginaleffects package to use the comparisons function, as above
comparisons(cox_mod_unadjusted, newdata = crossing(exposed = 0, time_zero = 5:19),
variables = "exposed", type = "lp", transform_post = exp) %>%
select(time_zero, HR = comparison, conf.low, conf.high) %>%
ggplot(aes(x = time_zero, y = HR, ymin = conf.low, ymax = conf.high)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "darkgrey") +
geom_point() +
geom_errorbar() +
labs(y = "Hazard ratio", x = "Week of infection")
We need to adjust for confounding. As with any analysis, there are a number of ways we could do so.
We could stratify the Kaplan-Meier survival curves by the confounders, and use those to estimate stratified risks:
<- survfit(Surv(time_in, time_out, event) ~ exposed + time_zero + confounder,
km_mod_adjusted data = expanded_data_censored_confounded)
<- get_km_summary(km_mod_adjusted, times = 20)
km_mod_adjusted_to_plot %>%
km_mod_adjusted_to_plot mutate(conf.lower.tmp = 1 - conf.upper,
conf.upper = 1 - conf.lower,
conf.lower = conf.lower.tmp,
week = parse_number(time_zero),
exposed = factor(exposed, labels = c("no", "yes"))) %>%
ggplot(aes(exposed, risk, color = exposed)) +
geom_point() +
geom_errorbar(aes(ymin = conf.lower, ymax = conf.upper)) +
facet_grid(switch = "x", cols = vars(week), rows = vars(confounder), labeller = label_both) +
labs(y = "Risk of spontaneous abortion", x = NULL) +
theme(legend.position = "none", axis.text.x = element_blank())
Of course, this only works if there are a small number of categorical confounders. Even with our large dataset, confidence intervals could be large. Unless we have reason to believe that the magnitude of the effect differs by a confounder (i.e., it is also an effect modifier), we might want to average over values of the confounder.
<- coxph(Surv(time_in, time_out, event) ~ exposed*time_zero + strata(confounder),
cox_mod_adjusted data = expanded_data_censored_confounded, id = id)
::tidy(cox_mod_adjusted, exponentiate = TRUE, conf.int = TRUE) broom
## # A tibble: 3 × 8
## term estimate std.error robust.se statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 exposed 0.986 0.0434 0.0387 -0.376 0.707 0.913 1.06
## 2 time_zero 1.00 0.00149 0.000388 -0.0810 0.935 0.999 1.00
## 3 exposed:tim… 1.00 0.00364 0.00342 0.227 0.820 0.994 1.01
# this next figure requires the marginaleffects package to use the comparisons function, as above
# the code is a bit complicated because we have to do it week-by-week
map_dfr(5:19, ~{
comparisons(cox_mod_adjusted,
newdata = filter(expanded_data_censored_confounded, exposed == 0, time_zero == .x),
variables = "exposed", type = "lp") %>%
summary(transform_avg = exp) %>%
mutate(time_zero = .x)
%>%
}) ggplot(aes(x = time_zero, y = estimate, ymin = conf.low, ymax = conf.high)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "darkgrey") +
geom_point() +
geom_errorbar() +
scale_y_log10() +
labs(y = "Hazard ratio", x = "Week of infection")
We could also standardize the risk estimates from the Cox or a pooled logistic regression model. Here we’ll standardize to the distribution of covariates (well, the single one) in the whole population at baseline.
<- expanded_data_censored_confounded %>%
expanded_data_censored_confounded_weekly # make a row for every possible week
crossing(week = 5:45) %>%
# then only keep those weeks in between time_in and time_out
filter(week >= time_in & week <= ceiling(time_out)) %>%
group_by(id, time_zero, exposed) %>%
# only assign event to the last row (when it actually happened)
mutate(event = case_when(
row_number() == n() ~ event,
TRUE ~ 0
%>%
)) ungroup()
# this will take some time to run... it's a really big dataset!
<- glm(event ~ splines::ns(week, 4)*confounder + exposed + exposed:time_zero,
glm_mod_adjusted data = expanded_data_censored_confounded_weekly,
family = binomial())
# expand the dataset so every person has a row for every week and every time zero
<- dat_confounded %>%
glm_to_plot_adjusted crossing(exposed = c(0, 1), week = 5:20, time_zero = 5:19) %>%
filter(week >= time_zero)
$hazard <- predict(glm_mod_adjusted, newdata = glm_to_plot_adjusted, type = "response")
glm_to_plot_adjusted
<- glm_to_plot_adjusted %>%
glm_risks_adjusted group_by(id, time_zero) %>%
arrange(week) %>%
mutate(int_surv = 1 - hazard,
surv = cumprod(int_surv),
risk = 1 - surv) %>%
ungroup() %>%
filter(week == 20) %>%
group_by(exposed, time_zero) %>%
summarise(risk = mean(risk), .groups = "drop") %>%
mutate(exposed = factor(exposed, labels = c("no", "yes")))
%>%
glm_risks_adjusted mutate(week = time_zero) %>%
ggplot(aes(exposed, risk, color = exposed)) +
geom_point() +
facet_grid(switch = "x", cols = vars(week), labeller = label_both) +
labs(y = "Risk of spontaneous abortion", x = NULL) +
theme(legend.position = "bottom", axis.text.x = element_blank())
We can also compute risk ratios directly from the standardized risks:
%>%
glm_risks_adjusted pivot_wider(names_from = exposed, values_from = risk, names_prefix = "risk_") %>%
mutate(risk_ratio = risk_yes / risk_no)
## # A tibble: 15 × 4
## time_zero risk_no risk_yes risk_ratio
## <int> <dbl> <dbl> <dbl>
## 1 5 0.240 0.251 1.04
## 2 6 0.212 0.222 1.05
## 3 7 0.187 0.197 1.05
## 4 8 0.165 0.174 1.05
## 5 9 0.145 0.153 1.06
## 6 10 0.126 0.134 1.07
## 7 11 0.108 0.116 1.07
## 8 12 0.0911 0.0986 1.08
## 9 13 0.0744 0.0815 1.10
## 10 14 0.0584 0.0651 1.12
## 11 15 0.0436 0.0499 1.15
## 12 16 0.0306 0.0366 1.20
## 13 17 0.0199 0.0256 1.29
## 14 18 0.0116 0.0169 1.46
## 15 19 0.00544 0.0105 1.92
Again, this process should be bootstrapped to estimate confidence intervals.
First, we turn the whole procedure into a function. We’ll have to repeat it many, many times, so if there are opportunities to speed up your code, take them! This code was written for readibility, not speed…
<- function(dat_confounded) {
get_standardized_risks
# clone every observation twice (exposed + unexposed) for every time zero
<- crossing(dat_confounded, time_zero = 5:19,
expanded_data_confounded exposed = c(0, 1)) %>%
# but don't include if not eligible:
filter(!(
<= time_zero) | # pregnancy ended before time zero
(time_ended !is.na(time_exposed) & time_exposed < time_zero) # exposed before the time zero
(
))
<- expanded_data_confounded %>%
expanded_data_censored_confounded mutate(time_in = time_zero,
# event or censoring time
time_out = case_when(
# never exposed, but event during grace period (counts for both)
is.na(time_exposed) & time_ended < time_in + 1 ~ time_ended,
# assigned to exposed, but don't do it before grace period ends
== 1 & is.na(time_exposed) ~ time_in + 1,
exposed == 1 & time_exposed >= time_in + 1 ~ time_in + 1,
exposed # assigned to unexposed, but later get exposed
== 0 & !is.na(time_exposed) ~ time_exposed,
exposed # everyone else
TRUE ~ time_ended
),event = case_when(
< time_ended ~ 0,
time_out TRUE ~ 1
)%>%
) # exclude those censored immediately (time in = time out)
filter(time_in < time_out)
<- expanded_data_censored_confounded %>%
expanded_data_censored_confounded_weekly # make a row for every possible week
crossing(week = 5:45) %>%
# then only keep those weeks in between time_in and time_out
filter(week >= time_in & week <= ceiling(time_out)) %>%
group_by(id, time_zero, exposed) %>%
# only assign event to the last row (when it actually happened)
mutate(event = case_when(
row_number() == n() ~ event,
TRUE ~ 0
%>%
)) ungroup()
<- glm(event ~ splines::ns(week, 4)*confounder + exposed + exposed:time_zero,
glm_mod_adjusted data = expanded_data_censored_confounded_weekly,
family = binomial())
<- dat_confounded %>%
glm_to_plot_adjusted crossing(exposed = c(0, 1), week = 5:20, time_zero = 5:19) %>%
filter(week >= time_zero)
$hazard <- predict(glm_mod_adjusted, newdata = glm_to_plot_adjusted, type = "response")
glm_to_plot_adjusted
<- glm_to_plot_adjusted %>%
glm_risks_adjusted group_by(id, time_zero) %>%
arrange(week) %>%
mutate(int_surv = 1 - hazard,
surv = cumprod(int_surv),
risk = 1 - surv) %>%
ungroup() %>%
filter(week == 20) %>%
group_by(exposed, time_zero) %>%
summarise(risk = mean(risk), .groups = "drop") %>%
mutate(exposed = factor(exposed, labels = c("no", "yes")))
# putting in this form for the bootstrap function
# ie columns for term and estimate
%>%
glm_risks_adjusted pivot_wider(names_from = exposed, values_from = risk, names_prefix = "risk_") %>%
mutate(risk_ratio = risk_yes / risk_no) %>%
pivot_longer(-time_zero, values_to = "estimate") %>%
unite(col = "term", name, time_zero)
}
The rsample
package makes bootstrapping easy. You can
read more about it here.
library(rsample)
# you obviously need many more bootstrap samples!
<- bootstraps(dat_confounded, times = 2)
boots
<- boots %>%
boot_models mutate(risks = map(splits, ~get_standardized_risks(analysis(.x))))
<- boot_models %>%
bootstrapped_ests # compute percentile confidence intervals
int_pctl(risks) %>%
separate(term, into = c("risk", "type", "time_zero")) %>%
mutate(time_zero = parse_number(time_zero)) %>%
select(-risk)
%>%
bootstrapped_ests filter(type %in% c("no", "yes")) %>%
mutate(week = time_zero) %>%
ggplot(aes(x = type, y = .estimate, ymin = .lower, ymax = .upper, color = type)) +
geom_point() +
geom_errorbar() +
facet_grid(switch = "x", cols = vars(week), labeller = label_both) +
labs(y = "Risk of spontaneous abortion", x = NULL, color = "exposed") +
theme(legend.position = "bottom", axis.text.x = element_blank())
%>%
bootstrapped_ests filter(type == "ratio") %>%
ggplot(aes(x = time_zero, y = .estimate, ymin = .lower, ymax = .upper)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "darkgrey") +
geom_point() +
geom_errorbar() +
scale_y_log10() +
labs(x = "Week of infection", y = "Risk ratio")