Lab 1

Summarizing the data
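The setup chunk for these labs isn't shown, so here is an assumed minimal setup for the packages the code relies on. Note that sab_2x2() and get_km_summary() are helper functions supplied with the lab materials (not shown here), and ids_of_interest is a small set of example pregnancies; the values below are an assumption chosen to match the printed output.

# packages used throughout the labs
library(tidyverse) # read_csv(), dplyr, ggplot2, ...
library(survival)  # Surv(), survfit(), coxph()
library(gtsummary) # tbl_regression()

# example pregnancies we'll follow through the data manipulations
ids_of_interest <- c(712, 4603, 8527, 9493)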

# read in data and save it as an object called dat
# (can also use read.csv, but I like this function from the tidyverse package better)
dat <- read_csv("simulated-data.csv")
## Rows: 10000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): id, time_ended, time_potentially_exposed, time_exposed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# look at the first 6 rows
head(dat)
## # A tibble: 6 × 4
##      id time_ended time_potentially_exposed time_exposed
##   <dbl>      <dbl>                    <dbl>        <dbl>
## 1     1       38.3                     20.1         20.1
## 2     2       39.4                     30.6         30.6
## 3     3       39                       44.7         NA  
## 4     4       16.1                     29.6         NA  
## 5     5       38.1                     38           38  
## 6     6       37.7                     27           27
# another way of looking at the data (helpful with lots of variables)
# (this one is from one of the tidyverse packages)
glimpse(dat)
## Rows: 10,000
## Columns: 4
## $ id                       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ time_ended               <dbl> 38.28571, 39.42857, 39.00000, 16.14286, 38.14…
## $ time_potentially_exposed <dbl> 20.142857, 30.571429, 44.714286, 29.571429, 3…
## $ time_exposed             <dbl> 20.142857, 30.571429, NA, NA, 38.000000, 27.0…
# summary statistics on the data
summary(dat)
##        id          time_ended     time_potentially_exposed  time_exposed   
##  Min.   :    1   Min.   : 5.143   Min.   : 5.143           Min.   : 5.143  
##  1st Qu.: 2501   1st Qu.:18.714   1st Qu.:15.000           1st Qu.:12.286  
##  Median : 5000   Median :38.429   Median :25.000           Median :21.000  
##  Mean   : 5000   Mean   :31.465   Mean   :25.059           Mean   :21.399  
##  3rd Qu.: 7500   3rd Qu.:40.143   3rd Qu.:35.143           3rd Qu.:30.000  
##  Max.   :10000   Max.   :45.000   Max.   :45.000           Max.   :43.857  
##                                                            NA's   :3399

We can plot the “cumulative incidence” of pregnancies ending using the stat_ecdf() function from ggplot2. ecdf stands for “empirical cumulative distribution function,” so this will plot the cumulative proportion of pregnancies that are completed (the time_ended variable) by each time point.

ggplot(dat) +
  stat_ecdf(aes(time_ended)) +
  scale_y_continuous(labels = scales::percent) + # y axis as percentage
  coord_cartesian(xlim = c(5, 45), expand = FALSE) + # x axis limits
  labs(
    x = "Gestational age at end of pregnancy (weeks)",
    y = "Cumulative incidence pregnancies completed"
  )

We can compute the values from the figure by finding the proportion (mean()) of observations with time_ended less than a certain value:

summarise(dat, sab = mean(time_ended < 20))
## # A tibble: 1 × 1
##     sab
##   <dbl>
## 1 0.258

Note: You may know another way to compute these quantities… there are many ways to do the same thing in R, and some of them are easier (less code) than the ones I’m showing you. I’m including code that I have found is easiest to read/most intuitive for new R users. If you want to try out other ways you know, that’s great too!
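For example, here is the same proportion as a base R one-liner instead of summarise():

mean(dat$time_ended < 20)
# returns the same 0.258 as the summarise() call above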

We may want to look at the distribution of gestational age at delivery among those pregnancies surviving past 20 weeks (for example).

# create a new object with all the pregnancies whose time_ended is past 20
deliveries <- filter(dat, time_ended >= 20)

ggplot(deliveries) +
  stat_ecdf(aes(time_ended)) +
  scale_y_continuous(labels = scales::percent) +
  coord_cartesian(xlim = c(20, 45), expand = FALSE) + # change limits
  labs(
    x = "Gestational age at delivery (weeks)",
    y = "Cumulative pregnancies completed"
  )

And compute the proportion of preterm deliveries:

summarise(deliveries, preterm = mean(time_ended < 37))
## # A tibble: 1 × 1
##   preterm
##     <dbl>
## 1   0.122

Try adding another summary statistic, very_preterm, to compute the proportion of deliveries occurring before 32 weeks.
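One possible solution, following the same pattern as above:

summarise(deliveries,
          preterm = mean(time_ended < 37),
          very_preterm = mean(time_ended < 32))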

Defining new variables

One “naive” approach is simply to define participants as exposed if they were infected during pregnancy while still at risk of spontaneous abortion (i.e., before 20 weeks), and unexposed otherwise. We’ll create that variable as well as an outcome variable.

dat <- dat %>%   
  # exposure time is missing (NA) if it didn't happen during pregnancy, so turn that into a 0/1 variable (if it happened while at risk of SAB)
  mutate(exposed_while_pregnant = as.numeric(!is.na(time_exposed) & time_exposed < 20), 
         # we also want the week as a whole number, not a fraction
         week_exposed = floor(time_exposed),
         sab = as.numeric(time_ended < 20))

Initial “effect” estimates

We can look at the 2-by-2 table and the naive risk ratio:

sab_2x2(dat)
                        COVID during pregnancy
                        Yes             No              Total
Spontaneous abortion
  Yes                   332 (11%)       2,252 (33%)     2,584 (26%)
  No                    2,760 (89%)     4,656 (67%)     7,416 (74%)
  Total                 3,092 (100%)    6,908 (100%)    10,000 (100%)
# "risk ratio" = Pr(Y = 1 | A = 1) / Pr(Y = 1 | A = 0)
mean(dat$sab[dat$exposed_while_pregnant == 1]) / mean(dat$sab[dat$exposed_while_pregnant == 0]) 
## [1] 0.3293689

This approach is of course hugely biased: pregnancies have to survive long enough to be exposed, so the exposed group accrues immortal person-time and its risk of spontaneous abortion is artificially low.

We can instead allocate person-time and events according to exposure status, with all time after infection considered exposed:

# collect all the unexposed person-time and events
dat_unexposed <- dat %>% 
  mutate(
    event = case_when(
      exposed_while_pregnant == 0 & sab == 1 ~ 1,
      TRUE ~ 0
    ),
    # everyone starts unexposed at 5
    time_in = 5,
    # stop time is the end of pregnancy if the event happened while unexposed,
    # the censoring (exposure) time if exposed before 20 weeks, or 20 weeks otherwise
    time_out = case_when(
      event == 1 ~ time_ended,
      exposed_while_pregnant == 1 ~ time_exposed,
      TRUE ~ 20
    ),
    exposure = 0
  )

# then the same for exposed
dat_exposed <- dat %>% 
  filter(exposed_while_pregnant == 1) %>%
  mutate(
    # since you can't change from exposed to unexposed, we know
    # those exposed while pregnant had exposed events
    event = sab,
    # the exposed start time is when someone is exposed
    time_in = time_exposed,
    # and always ends at the end of pregnancy/20 weeks
    time_out = case_when(
      event == 1 ~ time_ended,
      TRUE ~ 20
    ),
    exposure = 1
  )

long_dat <- bind_rows(dat_unexposed, dat_exposed) %>% 
  arrange(id) %>% 
  select(id, exposure, time_in, time_out, event) 

# see what it looks like in a few people
long_dat %>% 
  filter(id %in% ids_of_interest)
## # A tibble: 6 × 5
##      id exposure time_in time_out event
##   <dbl>    <dbl>   <dbl>    <dbl> <dbl>
## 1   712        0     5       12.6     1
## 2  4603        0     5       12.7     0
## 3  4603        1    12.7     20       0
## 4  8527        0     5       12       0
## 5  8527        1    12       20       0
## 6  9493        0     5       15.6     1

We could just compute a rate ratio from events and person-time within each exposure category; for example, something like:
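# events, person-time, and rates within each exposure group
rates <- long_dat %>%
  group_by(exposure) %>%
  summarise(events = sum(event),
            person_time = sum(time_out - time_in),
            rate = events / person_time)

# rate ratio: exposed vs. unexposed
rates$rate[rates$exposure == 1] / rates$rate[rates$exposure == 0]

Or we could fit a Cox model: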

coxph(Surv(time_in, time_out, event) ~ exposure, id = id, data = long_dat) %>% 
  tbl_regression(exponentiate = TRUE)
Characteristic    HR     95% CI        p-value
exposure          1.01   0.90, 1.14    0.8
HR = Hazard Ratio, CI = Confidence Interval

This works well in this setting (under the null, with no confounding), but hazard ratios are difficult to interpret and have well-known drawbacks as causal quantities.

We might want to consider week-specific risks instead. We can calculate, for example, the risk of spontaneous abortion after exposure at 6 weeks: Pr(Y = 1 | X = 6). Compute the risk after exposure at some other weeks of your choice.

# subset just to the people exposed anytime in week 6
# recall that week_exposed is floor(time_exposed)
week_6_exposed <- dat %>%
  filter(week_exposed == 6)

# Pr(Y = 1 | X = 6)
mean(week_6_exposed$sab)
## [1] 0.2155172
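To repeat this for several exposure weeks at once, one option is to group by week (a sketch of the same calculation):

# conditional risk of SAB by week of exposure
dat %>%
  filter(week_exposed %in% c(6, 11, 16)) %>%
  group_by(week_exposed) %>%
  summarise(risk = mean(sab))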

However, we can’t directly compare these numbers because each risk is conditional on still being pregnant up to that point.

We can see how different they could be. The risk is where each week-specific incidence curve hits the dashed line at 20 weeks.

dat %>%
  filter(week_exposed %in% c(6, 11, 16, 21)) %>% 
  ggplot(aes(color = factor(week_exposed))) +
  geom_vline(xintercept = 20, linetype = "dashed", color = "darkgrey") +
  stat_ecdf(aes(time_ended)) +
  geom_segment(aes(y = 0, yend = 0, x = 5, xend = 20), color = "grey92") +
  coord_cartesian(expand = FALSE) +
  labs(y = "Cumulative pregnancies completed", x = "Gestational age", color = "Week exposed")

Lab 2

Clone and censor: single trial

We’ll prepare the data for a single trial (with a single time zero: week 12) first. We’ll clone the data so that each observation has a row for every treatment strategy (get exposed this week vs. don’t). Then we’ll remove, or censor, the data that doesn’t match that treatment strategy.

Even if we don’t have censoring due to loss to follow-up, we have to use time-to-event methods that can take our artificial censoring into account. We put the data into a format in which time_in is the time an observation starts contributing data (time zero, randomization) and time_out is either the censoring or event time. The event variable tells us whether the observation was censored (0) or the pregnancy ended (1). Recall that observations are censored either because they get exposed when assigned not to be, or because they fail to get exposed by the end of the grace period when assigned to be. If an event occurs during the grace period, it counts towards both exposure groups.

# clone every observation twice (exposed + unexposed) for every time zero
expanded_data_12 <- crossing(dat, time_zero = 12, 
                                   exposed = c(0, 1)) %>%
  # but don't include if not eligible:
  filter(!(
    (time_ended <= time_zero) | # pregnancy ended before time zero
      (!is.na(time_exposed) & time_exposed < time_zero) # exposed before the time zero
  ))
# look at some ids
filter(expanded_data_12, id %in% ids_of_interest)
## # A tibble: 8 × 9
##      id time_ended time_potentially_… time_exposed exposed_while_p… week_exposed
##   <dbl>      <dbl>              <dbl>        <dbl>            <dbl>        <dbl>
## 1   712       12.6               22           NA                  0           NA
## 2   712       12.6               22           NA                  0           NA
## 3  4603       38.9               12.7         12.7                1           12
## 4  4603       38.9               12.7         12.7                1           12
## 5  8527       39.9               12           12                  1           12
## 6  8527       39.9               12           12                  1           12
## 7  9493       15.6               36.7         NA                  0           NA
## 8  9493       15.6               36.7         NA                  0           NA
## # … with 3 more variables: sab <dbl>, time_zero <dbl>, exposed <dbl>
expanded_data_censored_12 <- expanded_data_12 %>% 
  mutate(time_in = 12,
         # event or censoring time
         time_out = case_when(
           # never exposed, but event during grace period (counts for both)
           is.na(time_exposed) & time_ended < time_in + 1 ~ time_ended,
           # assigned to exposed, but don't do it before grace period ends
           exposed == 1 & is.na(time_exposed) ~ time_in + 1,
           exposed == 1 & time_exposed >= time_in + 1 ~ time_in + 1,
           # assigned to unexposed, but later get exposed
           exposed == 0 & !is.na(time_exposed) ~ time_exposed,
           # everyone else
           TRUE ~ time_ended
         ),
         event = case_when(
           time_out < time_ended ~ 0,
           TRUE ~ 1
         )
  ) %>% 
    # exclude those censored immediately (time in = time out)
  filter(time_in < time_out)

expanded_data_censored_12 %>% 
  filter(id %in% ids_of_interest)
## # A tibble: 7 × 12
##      id time_ended time_potentially_… time_exposed exposed_while_p… week_exposed
##   <dbl>      <dbl>              <dbl>        <dbl>            <dbl>        <dbl>
## 1   712       12.6               22           NA                  0           NA
## 2   712       12.6               22           NA                  0           NA
## 3  4603       38.9               12.7         12.7                1           12
## 4  4603       38.9               12.7         12.7                1           12
## 5  8527       39.9               12           12                  1           12
## 6  9493       15.6               36.7         NA                  0           NA
## 7  9493       15.6               36.7         NA                  0           NA
## # … with 6 more variables: sab <dbl>, time_zero <dbl>, exposed <dbl>,
## #   time_in <dbl>, time_out <dbl>, event <dbl>

Now we can estimate risks using a Kaplan-Meier estimator, since it can handle censoring.

km_12 <- survfit(Surv(time_in, time_out, event) ~ exposed,
                 data = expanded_data_censored_12)

# get survival/risk estimates
km_12_to_plot <- get_km_summary(km_12, times = unique(km_12$time))

# plot as cumulative incidence curve
km_12_to_plot %>%
  mutate(exposed = factor(exposed, labels = c("no", "yes"))) %>%
  ggplot() +
  geom_vline(xintercept = 20, linetype = "dashed", color = "darkgrey") +
  geom_path(aes(time, risk,  color = exposed)) +
  geom_segment(aes(y = 0, yend = 0, x = 5, xend = 21), color = "grey92") +
  labs(y = "Cumulative pregnancies completed", x = "Gestational age") +
  theme(legend.position = "bottom")

# we'll define the risk at 20 weeks as the risk of SAB
get_km_summary(km_12, times = 20) %>% 
  select(`COVID at week 12` = exposed, `Risk of SAB` = risk)
## # A tibble: 2 × 2
##   `COVID at week 12` `Risk of SAB`
##                <dbl>         <dbl>
## 1                  0        0.0906
## 2                  1        0.0805

Now we can apply this to every time zero we wish, as a separate trial. Just as we assigned each observation to both exposure strategies (because no one was really assigned an exposure, we don’t know which arm they belong in), we’ll also have everyone participate in all the trials. Of course, they can only participate in trials that begin before they are exposed or their pregnancy has ended.

# clone every observation twice (exposed + unexposed) for every time zero
expanded_data <- crossing(dat, time_zero = 5:19, 
                                   exposed = c(0, 1)) %>%
  # but don't include if not eligible:
  filter(!(
    (time_ended <= time_zero) | # pregnancy ended before time zero
      (!is.na(time_exposed) & time_exposed < time_zero) # exposed before the time zero
  ))

expanded_data_censored <- expanded_data %>% 
  mutate(time_in = time_zero,
         # event or censoring time
         time_out = case_when(
           # never exposed, but event during grace period (counts for both)
           is.na(time_exposed) & time_ended < time_in + 1 ~ time_ended,
           # assigned to exposed, but don't do it before grace period ends
           exposed == 1 & is.na(time_exposed) ~ time_in + 1,
           exposed == 1 & time_exposed >= time_in + 1 ~ time_in + 1,
           # assigned to unexposed, but later get exposed
           exposed == 0 & !is.na(time_exposed) ~ time_exposed,
           # everyone else
           TRUE ~ time_ended
         ),
         event = case_when(
           time_out < time_ended ~ 0,
           TRUE ~ 1
         )
  ) %>% 
  # exclude those censored immediately (time in = time out)
  filter(time_in < time_out)

Now we can use this in, e.g., a Kaplan-Meier model:

# we want to fit a survival curve for every time zero, so we stratify both on exposure and time zero
km_mod <- survfit(Surv(time_in, time_out, event) ~ exposed + time_zero,
              data = expanded_data_censored)

# get survival/risk estimates at every possible time for each stratum
km_mod_to_plot <- get_km_summary(km_mod, times = unique(km_mod$time), numeric_strata = TRUE)
km_mod_to_plot
## # A tibble: 8,400 × 8
##     time n.risk  surv conf.lower conf.upper    risk exposed time_zero
##    <dbl>  <dbl> <dbl>      <dbl>      <dbl>   <dbl>   <dbl>     <dbl>
##  1  5.14  10000 0.994      0.993      0.996 0.00570       0         5
##  2  5.29   9919 0.989      0.987      0.991 0.0112        0         5
##  3  5.43   9828 0.982      0.979      0.984 0.0185        0         5
##  4  5.57   9716 0.975      0.972      0.978 0.0247        0         5
##  5  5.71   9625 0.969      0.966      0.973 0.0307        0         5
##  6  5.86   9525 0.963      0.959      0.966 0.0373        0         5
##  7  6      9420 0.956      0.952      0.960 0.0435        0         5
##  8  6.14   9317 0.950      0.945      0.954 0.0502        0         5
##  9  6.29   9222 0.944      0.940      0.949 0.0557        0         5
## 10  6.43   9144 0.941      0.936      0.945 0.0594        0         5
## # … with 8,390 more rows

And plot the cumulative incidence curves for each stratum. Note that there is a pair of exposed/unexposed curves for each time zero week.

km_mod_to_plot %>%
  mutate(group = fct_cross(as.character(time_zero), as.character(exposed)),
         exposed = factor(exposed, labels = c("no", "yes"))) %>%
  ggplot() +
  geom_vline(xintercept = 20, linetype = "dashed", color = "darkgrey") +
  geom_path(aes(time, risk, group = group, color = exposed)) +
  geom_segment(aes(y = 0, yend = 0, x = 5, xend = 21), color = "grey92") +
  labs(y = "Cumulative pregnancies completed", x = "Gestational age")

If we want to estimate the risks of spontaneous abortion, we could look at the cumulative incidence curves at 20 weeks, and then plot those risks across time zeros. We see the risk decreasing monotonically over pregnancy for the unexposed group, as expected (since it is made up of the same people). The risk for the exposed group bounces around a little more, since it is estimated from different people every week, but it still follows the same pattern. (Recall that these data were generated under the null.)

km_mod_to_plot %>%
  filter(time == 20) %>%
  # flip the confidence intervals for the risk, not the survival
  mutate(conf.lower.tmp = 1 - conf.upper,
         conf.upper = 1 - conf.lower,
         conf.lower = conf.lower.tmp,
         week = time_zero,
         exposed = factor(exposed, labels = c("no", "yes"))) %>%
  ggplot(aes(exposed, risk, color = exposed)) +
  geom_point() +
  geom_errorbar(aes(ymin = conf.lower, ymax = conf.upper)) +
  facet_grid(switch = "x", cols = vars(week), labeller = label_both) +
  labs(y = "Risk of spontaneous abortion", x = NULL) +
  theme(legend.position = "bottom", axis.text.x = element_blank())

Cox model

Instead of estimating a separate survival curve for each exposure/time zero stratum, we might want to fit a model that assumes some structure about the effect of exposure across different weeks of gestation. For example, we might fit a Cox model and assume that the effect of exposure changes linearly (on the log(hazard ratio) scale) across gestation:

# really just need the exposed and exposed:time_zero terms, but it can cause problems in the next step without both main effects
cox_mod <- coxph(Surv(time_in, time_out, event) ~ exposed*time_zero,
                    data = expanded_data_censored, id = id) #id argument gives robust se's
tbl_regression(cox_mod, exponentiate = TRUE)
Characteristic         HR     95% CI        p-value
exposed                0.94   0.88, 1.00    0.058
time_zero              1.00   1.00, 1.00    0.6
exposed * time_zero    1.01   1.00, 1.01    0.055
HR = Hazard Ratio, CI = Confidence Interval

Of course, under the null, there is neither an effect of exposure, nor does it change over time, so we would expect all the coefficients to be close to 1 on the hazard ratio scale. If we want to report hazard ratios, we could compute them for every time zero week (or several representative weeks):

# this is a really great package for computing combinations of coefficients, interaction terms, etc
# remotes::install_github("vincentarelbundock/marginaleffects")
library(marginaleffects)
comparisons(cox_mod, newdata = crossing(exposed = 0, time_zero = 5:19), 
            variables = "exposed", type = "lp", transform_post = exp) %>% 
  select(time_zero, `HR: exposed vs. unexposed` = comparison, conf.low, conf.high)
##    time_zero HR: exposed vs. unexposed  conf.low conf.high
## 1          5                 0.9691787 0.9287760  1.011339
## 2          6                 0.9747962 0.9364693  1.014692
## 3          7                 0.9804463 0.9435151  1.018823
## 4          8                 0.9861291 0.9497997  1.023848
## 5          9                 0.9918449 0.9552510  1.029841
## 6         10                 0.9975938 0.9598580  1.036813
## 7         11                 1.0033761 0.9636715  1.044717
## 8         12                 1.0091918 0.9667854  1.053458
## 9         13                 1.0150413 0.9693117  1.062928
## 10        14                 1.0209246 0.9713600  1.073018
## 11        15                 1.0268421 0.9730264  1.083634
## 12        16                 1.0327939 0.9743899  1.094699
## 13        17                 1.0387801 0.9755131  1.106150
## 14        18                 1.0448011 0.9764449  1.117943
## 15        19                 1.0508569 0.9772232  1.130039

Although the Cox model does not require estimating the baseline hazard, we can estimate it and combine it with the model coefficients to produce absolute risk estimates. First we have to make a dataframe with all the combinations of coefficients we want to estimate the risk for:

cox_to_plot <- crossing(exposed = c(0, 1), time_zero = 5:19) %>%
  mutate(time_in = time_zero, time_out = 20, event = NA)
cox_to_plot
## # A tibble: 30 × 5
##    exposed time_zero time_in time_out event
##      <dbl>     <int>   <int>    <dbl> <lgl>
##  1       0         5       5       20 NA   
##  2       0         6       6       20 NA   
##  3       0         7       7       20 NA   
##  4       0         8       8       20 NA   
##  5       0         9       9       20 NA   
##  6       0        10      10       20 NA   
##  7       0        11      11       20 NA   
##  8       0        12      12       20 NA   
##  9       0        13      13       20 NA   
## 10       0        14      14       20 NA   
## # … with 20 more rows
cox_pred <- predict(cox_mod, newdata = cox_to_plot,
                    type = "expected", se.fit = TRUE)

# predict(type = "expected") gives cumulative hazards, which we transform into survival via S = exp(-H)
cox_to_plot$surv <- exp(-cox_pred$fit)
# the lower survival bound corresponds to the upper bound of the cumulative hazard (and vice versa)
cox_to_plot$conf.lower <- exp(-(cox_pred$fit + qnorm(.975)*cox_pred$se.fit))
cox_to_plot$conf.upper <- exp(-(cox_pred$fit + qnorm(.025)*cox_pred$se.fit))

cox_to_plot %>%
  mutate(risk = 1 - surv,
         conf.lower.tmp = 1 - conf.upper,
         conf.upper = 1 - conf.lower,
         conf.lower = conf.lower.tmp,
         exposed = factor(exposed, labels = c("no", "yes")),
         week = time_zero) %>%
  ggplot(aes(exposed, risk, color = exposed)) +
  geom_point() +
  geom_errorbar(aes(ymin = conf.lower, ymax = conf.upper)) +
  facet_grid(switch = "x", cols = vars(week), labeller = label_both) +
  labs(y = "Risk of spontaneous abortion", x = NULL) +
  theme(legend.position = "bottom", axis.text.x = element_blank())

Pooled logistic regression model

We could also fit a similar model using pooled logistic regression. Instead of a 0/1 outcome for each individual as in a usual logistic regression, we have a 0/1 outcome for each week that each individual was still pregnant, indicating whether or not their pregnancy ended that week.

First we have to again expand the data, now for every week from time zero to the end of pregnancy or week of censoring. (Of course, we could do the same for days, months, etc., depending on the granularity of the data.)

# this will take a few seconds since we're making a huge (long) dataset!
expanded_data_censored_weekly <- expanded_data_censored %>%
  # make a row for every possible week
  crossing(week = 5:45) %>%
  # then only keep those weeks in between time_in and time_out
  filter(week >= time_in & week <= ceiling(time_out)) %>%
  group_by(id, exposed, time_zero) %>%
  # only assign event to the last row (when it actually happened)
  mutate(event = case_when(
    row_number() == n() ~ event,
    TRUE ~ 0
  )) %>% 
  ungroup()
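To see what this person-week structure looks like, we can peek at the rows for one cloned pregnancy (a sketch; output not shown):

# one pregnancy, assigned to the exposed arm of the week-12 trial
expanded_data_censored_weekly %>%
  filter(id == 8527, exposed == 1, time_zero == 12) %>%
  select(id, exposed, time_zero, week, time_out, event)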

Then we can fit a model. Unlike the Cox model, we have to specify the functional form of the baseline hazard. Here I’m using natural cubic splines with 4 degrees of freedom to model how the baseline hazard changes over time. Other flexible functions are certainly possible.

glm_mod <- glm(event ~ splines::ns(week, df = 4) + exposed + exposed:time_zero,
               data = expanded_data_censored_weekly,
               family = binomial())
# fitting the confidence intervals with robust standard errors
# however, we are not necessarily going to be interpreting the parameters from this model
# (though they are discrete-time hazard ratios)
tbl_regression(glm_mod, exponentiate = TRUE, include = -contains("splines"),
               tidy_fun = partial(tidy_robust, vcov_type = "HC3", ci_method = "wald"))
Characteristic         OR     95% CI        p-value
exposed                1.00   0.91, 1.10    >0.9
exposed * time_zero    0.98   0.98, 0.99    <0.001
OR = Odds Ratio, CI = Confidence Interval
# (there will be a warning -- it's due to the package, not you!)

To get risk estimates, as with the Cox model we can make a dataframe with every combination of variables we want an estimate for. We can then predict the hazard (probability of delivery within the next week, given that someone is still pregnant) at every possible week as a combination of the coefficients, and then use those hazards to estimate the survival and risk at every week.

glm_to_plot <- crossing(exposed = c(0, 1), time_zero = 5:19, week = 5:20) %>%
  filter(week >= time_zero)

glm_to_plot$hazard <- predict(glm_mod, newdata = glm_to_plot, type = "response")

glm_to_plot %>%
  group_by(exposed, time_zero) %>%
  mutate(int_surv = 1 - hazard,
         surv = cumprod(int_surv),
         risk = 1 - surv) %>%
  ungroup() %>%
  mutate(exposed = factor(exposed, labels = c("no", "yes"))) %>%
  filter(week == 20) %>%
  mutate(week = time_zero) %>% 
  ggplot(aes(exposed, risk, color = exposed)) +
  geom_point() +
  coord_cartesian(ylim = c(0, .3)) +
  facet_grid(switch = "x", cols = vars(week), labeller = label_both) +
  labs(y = "Risk of spontaneous abortion", x = NULL) +
  theme(legend.position = "none", axis.text.x = element_blank())

It would be easiest to bootstrap this model to estimate confidence intervals for those estimates (see the Appendix for an example of bootstrapping a similar model).

Lab 3

Summarizing the data

First we’ll read in some confounded data.

dat_confounded <- read_csv("simulated-data-confounded.csv")

Compare the stratum-specific patterns of pregnancy length to the overall distribution, which is the same as in the data above. To see that overall distribution instead, remove color = confounder, which is what stratifies the figure.

ggplot(dat_confounded, aes(color = confounder)) +
  stat_ecdf(aes(time_ended)) +
  scale_y_continuous(labels = scales::percent) +
  coord_cartesian(xlim = c(5, 45), expand = FALSE) +
  labs(
    x = "Gestational age at end of pregnancy (weeks)",
    y = "Cumulative incidence pregnancies completed"
  )

We can do the same with the cumulative incidence of exposure:

ggplot(dat_confounded, aes(color = confounder)) +
  stat_ecdf(aes(time_potentially_exposed)) +
  scale_y_continuous(labels = scales::percent) +
  coord_cartesian(xlim = c(5, 45), expand = FALSE) +
  labs(
    x = "Gestational age at exposure",
    y = "Cumulative incidence of exposure"
  )

Unadjusted analyses

We repeat the process of cloning and censoring the data, and estimate stratified survival curves. When we estimate the risks of spontaneous abortion from those curves, the pregnancies that were exposed in the first trimester or so appear to be at higher risk.

# clone every observation twice (exposed + unexposed) for every time zero
expanded_data_confounded <- crossing(dat_confounded, time_zero = 5:19, 
                                   exposed = c(0, 1)) %>%
  # but don't include if not eligible:
  filter(!(
    (time_ended <= time_zero) | # pregnancy ended before time zero
      (!is.na(time_exposed) & time_exposed < time_zero) # exposed before the time zero
  ))

expanded_data_censored_confounded <- expanded_data_confounded %>% 
  mutate(time_in = time_zero,
         # event or censoring time
         time_out = case_when(
           # never exposed, but event during grace period (counts for both)
           is.na(time_exposed) & time_ended < time_in + 1 ~ time_ended,
           # assigned to exposed, but don't do it before grace period ends
           exposed == 1 & is.na(time_exposed) ~ time_in + 1,
           exposed == 1 & time_exposed >= time_in + 1 ~ time_in + 1,
           # assigned to unexposed, but later get exposed
           exposed == 0 & !is.na(time_exposed) ~ time_exposed,
           # everyone else
           TRUE ~ time_ended
         ),
         event = case_when(
           time_out < time_ended ~ 0,
           TRUE ~ 1
         )
  ) %>% 
  # exclude those censored immediately (time in = time out)
  filter(time_in < time_out)

km_mod_unadjusted <- survfit(Surv(time_in, time_out, event) ~ exposed + time_zero,
              data = expanded_data_censored_confounded)

# get survival/risk estimates at every possible time for each stratum
km_mod_unadjusted_to_plot <- get_km_summary(km_mod_unadjusted, times = unique(km_mod_unadjusted$time), numeric_strata = TRUE)

km_mod_unadjusted_to_plot %>%
  filter(time == 20) %>%
  # flip the confidence intervals for the risk, not the survival
  mutate(conf.lower.tmp = 1 - conf.upper,
         conf.upper = 1 - conf.lower,
         conf.lower = conf.lower.tmp,
         week = time_zero,
         exposed = factor(exposed, labels = c("no", "yes"))) %>%
  ggplot(aes(exposed, risk, color = exposed)) +
  geom_point() +
  geom_errorbar(aes(ymin = conf.lower, ymax = conf.upper)) +
  facet_grid(switch = "x", cols = vars(week), labeller = label_both) +
  labs(y = "Risk of spontaneous abortion", x = NULL) +
  theme(legend.position = "bottom", axis.text.x = element_blank())

If we fit a Cox model using the data, we can see that the hazard ratios are above 1 for the first trimester.

cox_mod_unadjusted <- coxph(Surv(time_in, time_out, event) ~ exposed*time_zero,
                    data = expanded_data_censored_confounded, id = id)
broom::tidy(cox_mod_unadjusted, exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 3 × 8
##   term         estimate std.error robust.se statistic p.value conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 exposed         1.08    0.0425   0.0312       2.51   0.0120    1.02       1.15
## 2 time_zero       1.00    0.00149  0.000331     0.659  0.510     1.00       1.00
## 3 exposed:tim…    0.997   0.00359  0.00305     -1.11   0.266     0.991      1.00
# this next figure requires the marginaleffects package to use the comparisons function, as above
comparisons(cox_mod_unadjusted, newdata = crossing(exposed = 0, time_zero = 5:19), 
            variables = "exposed", type = "lp", transform_post = exp) %>% 
  select(time_zero, HR = comparison, conf.low, conf.high) %>% 
  ggplot(aes(x = time_zero, y = HR, ymin = conf.low, ymax = conf.high)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "darkgrey") +
  geom_point() +
  geom_errorbar() +
  labs(y = "Hazard ratio", x = "Week of infection")

We need to adjust for confounding. As with any analysis, there are a number of ways we could do so.

Stratified Kaplan-Meier curves

We could stratify the Kaplan-Meier survival curves by the confounders, and use those to estimate stratified risks:

km_mod_adjusted <- survfit(Surv(time_in, time_out, event) ~ exposed + time_zero + confounder,
              data = expanded_data_censored_confounded)

km_mod_adjusted_to_plot <- get_km_summary(km_mod_adjusted, times = 20)
km_mod_adjusted_to_plot %>%
  mutate(conf.lower.tmp = 1 - conf.upper,
         conf.upper = 1 - conf.lower,
         conf.lower = conf.lower.tmp,
         week = parse_number(time_zero),
         exposed = factor(exposed, labels = c("no", "yes"))) %>%
  ggplot(aes(exposed, risk, color = exposed)) +
  geom_point() +
  geom_errorbar(aes(ymin = conf.lower, ymax = conf.upper)) +
  facet_grid(switch = "x", cols = vars(week), rows = vars(confounder), labeller = label_both) +
  labs(y = "Risk of spontaneous abortion", x = NULL) +
  theme(legend.position = "none", axis.text.x = element_blank()) 

Of course, this only works if there are a small number of categorical confounders. Even with our large dataset, the confidence intervals can be wide. Unless we have reason to believe that the magnitude of the effect differs by a confounder (i.e., that it is also an effect modifier), we might want to average over the values of the confounder.

Stratified Cox model

cox_mod_adjusted <- coxph(Surv(time_in, time_out, event) ~ exposed*time_zero + strata(confounder),
                    data = expanded_data_censored_confounded, id = id)
broom::tidy(cox_mod_adjusted, exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 3 × 8
##   term         estimate std.error robust.se statistic p.value conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 exposed         0.986   0.0434   0.0387     -0.376    0.707    0.913      1.06
## 2 time_zero       1.00    0.00149  0.000388   -0.0810   0.935    0.999      1.00
## 3 exposed:tim…    1.00    0.00364  0.00342     0.227    0.820    0.994      1.01
# this next figure requires the marginaleffects package to use the comparisons function, as above
# the code is a bit complicated because we have to do it week-by-week
map_dfr(5:19, ~{
  comparisons(cox_mod_adjusted, 
              newdata = filter(expanded_data_censored_confounded, exposed == 0, time_zero == .x), 
            variables = "exposed", type = "lp") %>%
    summary(transform_avg = exp) %>% 
    mutate(time_zero = .x)
}) %>% 
  ggplot(aes(x = time_zero, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "darkgrey") +
  geom_point() +
  geom_errorbar() +
  scale_y_log10() +
  labs(y = "Hazard ratio", x = "Week of infection")

Standardized estimates from pooled logistic regression

We could also standardize the risk estimates from the Cox or a pooled logistic regression model. Here we’ll standardize to the distribution of covariates (well, the single one) in the whole population at baseline.

expanded_data_censored_confounded_weekly <- expanded_data_censored_confounded %>%
  # make a row for every possible week
  crossing(week = 5:45) %>%
  # then only keep those weeks in between time_in and time_out
  filter(week >= time_in & week <= ceiling(time_out)) %>%
  group_by(id, time_zero, exposed) %>%
  # only assign event to the last row (when it actually happened)
  mutate(event = case_when(
    row_number() == n() ~ event,
    TRUE ~ 0
  )) %>% 
  ungroup()

# this will take some time to run... it's a really big dataset!
glm_mod_adjusted <- glm(event ~ splines::ns(week, 4)*confounder + exposed + exposed:time_zero,
               data = expanded_data_censored_confounded_weekly,
               family = binomial())

# expand the dataset so every person has a row for every week and every time zero
glm_to_plot_adjusted <- dat_confounded %>% 
  crossing(exposed = c(0, 1), week = 5:20, time_zero = 5:19) %>% 
  filter(week >= time_zero)

glm_to_plot_adjusted$hazard <- predict(glm_mod_adjusted, newdata = glm_to_plot_adjusted, type = "response")

glm_risks_adjusted <- glm_to_plot_adjusted %>%
  group_by(id, time_zero) %>% 
  arrange(week) %>% 
  mutate(int_surv = 1 - hazard,
         surv = cumprod(int_surv), 
         risk = 1 - surv) %>%
  ungroup() %>%
  filter(week == 20) %>%
  group_by(exposed, time_zero) %>% 
  summarise(risk = mean(risk), .groups = "drop") %>% 
  mutate(exposed = factor(exposed, labels = c("no", "yes")))

glm_risks_adjusted %>%
  mutate(week = time_zero) %>% 
  ggplot(aes(exposed, risk, color = exposed)) +
  geom_point() +
  facet_grid(switch = "x", cols = vars(week), labeller = label_both) +
  labs(y = "Risk of spontaneous abortion", x = NULL) +
  theme(legend.position = "bottom", axis.text.x = element_blank())

We can also compute risk ratios directly from the standardized risks:

glm_risks_adjusted %>% 
  pivot_wider(names_from = exposed, values_from = risk, names_prefix = "risk_") %>% 
  mutate(risk_ratio = risk_yes / risk_no)
## # A tibble: 15 × 4
##    time_zero risk_no risk_yes risk_ratio
##        <int>   <dbl>    <dbl>      <dbl>
##  1         5 0.240     0.251        1.04
##  2         6 0.212     0.222        1.05
##  3         7 0.187     0.197        1.05
##  4         8 0.165     0.174        1.05
##  5         9 0.145     0.153        1.06
##  6        10 0.126     0.134        1.07
##  7        11 0.108     0.116        1.07
##  8        12 0.0911    0.0986       1.08
##  9        13 0.0744    0.0815       1.10
## 10        14 0.0584    0.0651       1.12
## 11        15 0.0436    0.0499       1.15
## 12        16 0.0306    0.0366       1.20
## 13        17 0.0199    0.0256       1.29
## 14        18 0.0116    0.0169       1.46
## 15        19 0.00544   0.0105       1.92

Again, this process should be bootstrapped to estimate confidence intervals.

Appendix: Bootstrap

First, we turn the whole procedure into a function. We’ll have to repeat it many, many times, so if there are opportunities to speed up your code, take them! This code was written for readability, not speed…

get_standardized_risks <- function(dat_confounded) {
  
  # clone every observation twice (exposed + unexposed) for every time zero
  expanded_data_confounded <- crossing(dat_confounded, time_zero = 5:19, 
                                       exposed = c(0, 1)) %>%
    # but don't include if not eligible:
    filter(!(
      (time_ended <= time_zero) | # pregnancy ended before time zero
        (!is.na(time_exposed) & time_exposed < time_zero) # exposed before the time zero
    ))
  
  expanded_data_censored_confounded <- expanded_data_confounded %>% 
    mutate(time_in = time_zero,
           # event or censoring time
           time_out = case_when(
             # never exposed, but event during grace period (counts for both)
             is.na(time_exposed) & time_ended < time_in + 1 ~ time_ended,
             # assigned to exposed, but don't do it before grace period ends
             exposed == 1 & is.na(time_exposed) ~ time_in + 1,
             exposed == 1 & time_exposed >= time_in + 1 ~ time_in + 1,
             # assigned to unexposed, but later get exposed
             exposed == 0 & !is.na(time_exposed) ~ time_exposed,
             # everyone else
             TRUE ~ time_ended
           ),
           event = case_when(
             time_out < time_ended ~ 0,
             TRUE ~ 1
           )
    ) %>% 
    # exclude those censored immediately (time in = time out)
    filter(time_in < time_out)
  
  expanded_data_censored_confounded_weekly <- expanded_data_censored_confounded %>%
    # make a row for every possible week
    crossing(week = 5:45) %>%
    # then only keep those weeks in between time_in and time_out
    filter(week >= time_in & week <= ceiling(time_out)) %>%
    group_by(id, time_zero, exposed) %>%
    # only assign event to the last row (when it actually happened)
    mutate(event = case_when(
      row_number() == n() ~ event,
      TRUE ~ 0
    )) %>% 
    ungroup()
  
  glm_mod_adjusted <- glm(event ~ splines::ns(week, 4)*confounder + exposed + exposed:time_zero,
                          data = expanded_data_censored_confounded_weekly,
                          family = binomial())
  
  glm_to_plot_adjusted <- dat_confounded %>% 
    crossing(exposed = c(0, 1), week = 5:20, time_zero = 5:19) %>% 
    filter(week >= time_zero)
  
  glm_to_plot_adjusted$hazard <- predict(glm_mod_adjusted, newdata = glm_to_plot_adjusted, type = "response")
  
  glm_risks_adjusted <- glm_to_plot_adjusted %>%
    group_by(id, time_zero) %>% 
    arrange(week) %>% 
    mutate(int_surv = 1 - hazard,
           surv = cumprod(int_surv), 
           risk = 1 - surv) %>%
    ungroup() %>%
    filter(week == 20) %>%
    group_by(exposed, time_zero) %>% 
    summarise(risk = mean(risk), .groups = "drop") %>% 
    mutate(exposed = factor(exposed, labels = c("no", "yes")))
  
  # putting in this form for the bootstrap function
  # ie columns for term and estimate
  glm_risks_adjusted %>% 
    pivot_wider(names_from = exposed, values_from = risk, names_prefix = "risk_") %>% 
    mutate(risk_ratio = risk_yes / risk_no) %>% 
    pivot_longer(-time_zero, values_to = "estimate") %>% 
    unite(col = "term", name, time_zero)
}

The rsample package makes bootstrapping easy. You can read more about it in the package documentation at https://rsample.tidymodels.org.

library(rsample)

# you obviously need many more bootstrap samples!
boots <- bootstraps(dat_confounded, times = 2)

boot_models <- boots %>% 
  mutate(risks = map(splits, ~get_standardized_risks(analysis(.x))))

bootstrapped_ests <- boot_models %>% 
  # compute percentile confidence intervals
  int_pctl(risks) %>% 
  separate(term, into = c("risk", "type", "time_zero")) %>% 
  mutate(time_zero = parse_number(time_zero)) %>% 
  select(-risk)

bootstrapped_ests %>% 
  filter(type %in% c("no", "yes")) %>% 
  mutate(week = time_zero) %>% 
  ggplot(aes(x = type, y = .estimate, ymin = .lower, ymax = .upper, color = type)) +
  geom_point() +
  geom_errorbar() +
    facet_grid(switch = "x", cols = vars(week), labeller = label_both) +
  labs(y = "Risk of spontaneous abortion", x = NULL, color = "exposed") +
  theme(legend.position = "bottom", axis.text.x = element_blank())

bootstrapped_ests %>% 
  filter(type == "ratio") %>% 
  ggplot(aes(x = time_zero, y = .estimate, ymin = .lower, ymax = .upper)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "darkgrey") +
  geom_point() +
  geom_errorbar() +
  scale_y_log10() +
  labs(x = "Week of infection", y = "Risk ratio")