Factors associated with within-individual variability of lung function for people with cystic fibrosis: a longitudinal registry study

This is the code for the paper “Factors associated with within-individual variability of lung function for people with cystic fibrosis: a longitudinal registry study” by Palma et al., available as a preprint on medrxiv: https://www.medrxiv.org/content/10.1101/2023.05.12.23289768v1.

library(pacman)

p_load(dplyr, tidyr, ggplot2, lubridate, knitr, kableExtra, stringr, ggpubr, ggridges)  #, devtools
p_load(pipeR, DiagrammeR, DiagrammeRsvg, magrittr, rsvg)
p_load(brms, brmsmargins, posterior, bayesplot, tidybayes, survival, emmeans)

theme_set(theme_light(base_size = 14) + theme(panel.grid.minor = element_blank(),
    plot.margin = margin(t = 6, r = 6, b = 6, l = 6, unit = "pt"), legend.position = "bottom",
    legend.margin = margin(0, 0, 0, 0, unit = "pt"), legend.box.margin = margin(0,
        0, 0, 0, unit = "pt")))


"%ni%" <- Negate("%in%")

var_long <- "gli_fev"

Introduction

Dataset

Import the dataset and store it in the cfdata object.

cfdata_clean <- cfdata %>%
    mutate(s01encounterdate = lubridate::dmy(s01encounterdate), dmg_dateofdeath = lubridate::dmy(dmg_dateofdeath),
        dmg_dateofbirth = lubridate::dmy(dmg_dateofbirth)) %>%
    mutate_at(c("patient_id", "sitecode_anon", "dmg_sex", "dmg_ethnicity_gli", "s01isthisanannualreviewencounter",
        "s05culturespeciesstaph", "s05culturespeciespseudoaeruginos", "s07pancreaticenzymesupplements"),
        list(~factor(.)))

rm(cfdata)

The number of rows is 166117 and the number of variables is 1357. The number of unique patient ID is 13899.

Subset used for the analysis - inclusion criteria

We have kept only the first visit per year per each patient. To account for all transplants, you need to first compute transplanted indicators, then remove missing values (often, in the year of transplant gli_fev is missing).

explanatory_before <- c("patient_id", "age", "dmg_sex", "dmg_ageatdiagnosis", "dmg_dateofbirth", "f508_class")



cf2 <- cfdata_clean %>>%
  (~ c(nrow(.), length(unique(.$patient_id))) -> a1) %>>%
  filter(age >= 18 & age <= 50) %>>%
  (~ c(nrow(.), length(unique(.$patient_id))) -> a2) %>>%
  group_by(patient_id) %>% filter(year != lag(year) | row_number() == 1) %>>%
  (~ c(nrow(.), length(unique(.$patient_id))) -> a3) %>>%
  group_by(patient_id) %>% 
  mutate(Transplanted = cumsum(s10transplantsincelastannualrevi == "Y")) %>% 
  filter(Transplanted == 0) %>>%  
  (~ c(nrow(.), length(unique(.$patient_id))) -> a4)


cf2_bypatient <- cf2 %>% 
  select(patient_id, dmg_dateofbirth, dmg_sex, dmg_ageatdiagnosis, f508_class) %>% 
  distinct(across(everything())) %>%
  drop_na() %>% ##add_count(patient_id, name = "no_records") 
  mutate("diagnosedafter1yo" = factor(ifelse(dmg_ageatdiagnosis>1, 1, 0)),
         "f508_homozygous" = factor(ifelse(f508_class == "Homoz", 1, 0)),
         "yob" = year(dmg_dateofbirth))


cf2 <- left_join(cf2, 
                 select(cf2_bypatient, patient_id, diagnosedafter1yo, f508_homozygous, yob),
                 by = "patient_id")




explanatory_after <- c("patient_id", "age", "dmg_sex", "diagnosedafter1yo", "dmg_dateofbirth", "f508_homozygous")
dependent <-  var_long

cf2%>%
  finalfit::missing_pattern(dependent, explanatory_before, rotate.names = TRUE)

##       patient_id age dmg_dateofbirth dmg_sex f508_class gli_fev
## 50530          1   1               1       1          1       1
## 17878          1   1               1       1          1       1
## 3223           1   1               1       1          1       0
## 1573           1   1               1       1          1       0
##                0   0               0       0          0    4796
##       dmg_ageatdiagnosis      
## 50530                  1     0
## 17878                  0     1
## 3223                   1     1
## 1573                   0     2
##                    19451 24247

cf2 <- cf2 %>%
  filter(!!sym(var_long) > 0)  %>%
  drop_na(all_of(explanatory_after)) %>>%
  (~ c(nrow(.), length(unique(.$patient_id))) -> a5)

inclusion_diagram <- DiagrammeR::grViz("digraph {
  graph [layout = dot, rankdir = TB]

  node [shape = rectangle, fixedsize=true, width=4]
  rec1 [label = 'Original dataset\\n(N = @@6, n = @@1)']
  rec2 [label = 'Adults (18-49 years old)\\n(N = @@7, n = @@2)']
  rec3 [label = 'First review every year\\n(N = @@8, n = @@3)']
  rec4 [label = 'Annual reviews before lung transplant\\n(N = @@9, n = @@4)']
  rec5 [label = 'Non missing values of FEV1 and covariates\\n(N = @@10, n = @@5)']

  # edge definitions with the node IDs
  rec1 -> rec2 -> rec3 -> rec4 -> rec5   
  }

  [1]: a1
  [2]: a2
  [3]: a3
  [4]: a4
  [5]: a5
  [6]: a1[2]
  [7]: a2[2]
  [8]: a3[2]
  [9]: a4[2]
  [10]: a5[2]
  ")


inclusion_diagram

inclusion_diagram %>>% print(.) %>>% export_svg %>%
    charToRaw %>%
    rsvg_pdf("CFAdult_inclusiondiagram.pdf", width = 350)

Plots after inclusion criteria

p1 <- cf2[1:3971,] %>%    #3971  
  drop_na(age, all_of(var_long)) %>%
  ggplot(data = ., 
         aes_string(x = "age", y = var_long,  group = "patient_id"  #,color = "grey1"
         )
  ) + 
  geom_point(alpha = 0.05) + 
  geom_line(alpha = 0.05) +
  geom_smooth(data = cf2, 
              aes(x = age, y = get(var_long), group = dmg_sex, color = dmg_sex, fill = dmg_sex, linetype = dmg_sex),
              size = 1.25) +
  theme(legend.position = "bottom" ,
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        plot.margin = margin(t = 6, r = 6, b = 0, l = 6, 
                             unit = "pt") 
  ) + 
  scale_x_continuous(limits = c(18,50), 
                     breaks = scales::pretty_breaks(n = 10), 
                     expand = expansion(add = 0)) +
  scale_y_continuous(limits = c(0, NA), expand = expansion(add = 0.01)) + 
  scale_color_manual(breaks = c("F", "M"), values = c("#F8766D", "#001D81")) + 
  scale_fill_manual(breaks = c("F", "M"), values = c("#F8766D", "#001D81")) + 
  scale_linetype_manual(breaks = c("F", "M"), values = c("81", "11"), 
                        na.value = "solid") +
  labs(x = "Age",
       y = bquote(FEV[1]),
       color = "Sex",
       fill = "Sex",
       linetype = "Sex"
) 


p2 <- ggplot(cf2) +
  geom_density(aes(age, after_stat(count), fill = dmg_sex), 
               position = "fill", alpha = 0.5, adjust = 1/2) + 
  stat_ecdf(aes(age, color = dmg_sex, linetype = dmg_sex), 
            geom = "step", pad = FALSE, lwd = 1.25) + 
  geom_hline(aes(yintercept = 0.5), lty = 2) +  
  labs(x = "Age", 
       y = "Sex distribution", 
       fill = "Sex", linetype = "Sex") +
  scale_x_continuous(limits = c(18,50), 
                     breaks = scales::pretty_breaks(n = 10), 
                     expand = expansion(add = 0)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 5), 
                     labels = scales::percent, 
                     expand = expansion(add = 0)) +
  scale_color_manual(breaks = c("F", "M"), values = c("#F8766D", "#001D81")) + 
  scale_fill_manual(breaks = c("F", "M"), values = c("#F8766D", "#001D81")) +
  scale_linetype_manual(breaks = c("F", "M"), values = c("81", "11"),
                        na.value = "solid") +
  theme(legend.position="none", 
        plot.margin = margin(t = 0, r = 6, b = 6, l = 6, 
                             unit = "pt")
  )

ggpubr::ggarrange(p1, NULL, p2, 
                  heights = c(1.2, 0.05, 1),
                  ncol = 1, nrow = 3, align = "v", 
                  common.legend = TRUE, legend = "bottom")

rm(p1,p2)

cf2 %>%
    group_by(patient_id) %>%
    summarise(no_encounters = n()) %>%
    ggplot(., aes(x = no_encounters)) + geom_bar(width = 0.5) + scale_x_continuous(limits = c(0,
    NA), breaks = scales::pretty_breaks(n = 8), expand = expansion(add = 0)) + scale_y_continuous(limits = c(0,
    NA), breaks = seq(0, 2000, by = 100), expand = expansion(add = 10)) + labs(x = "Annual reviews",
    y = "Number of individuals")

Table 1

tab1_prep <- cf2 %>%
    group_by(patient_id) %>%
    summarise(dmg_sex, no_encounters = n(), ageatdiagnosis = first(dmg_ageatdiagnosis),
        diagnosedafter1yo = first(diagnosedafter1yo), f508_homozygous = first(f508_homozygous),
        ageatfirstvisit = first(age), ageatlastvisit = last(age), death_status = !is.na(dmg_dateofdeath),
        yearofbirth = first(yob), first_fev = first(!!sym(var_long)), last_fev = last(!!sym(var_long))) %>%
    distinct(across(everything()))

quantile_interval <- function(x) {
    qts <- round(quantile(x, probs = c(0.25, 0.5, 0.75), na.rm = T), 2)
    paste0(qts[2], " (", qts[1], "; ", qts[3], ")")
}





tab1_tot <- tab1_prep %>%
    mutate(dmg_sex = "Total") %>%
    group_by(dmg_sex) %>%
    summarise(`Number of individuals` = n_distinct(patient_id), `Median number of encounters per individual (IQR)` = quantile_interval(no_encounters),
        `Difference in years between first and last encounter date (IQR)` = quantile_interval(ageatlastvisit -
            ageatfirstvisit), `Median age at diagnosis in years (IQR)` = quantile_interval(ageatdiagnosis),
        `No of individuals diagnosed within the first year after birth (%)` = paste0(sum(diagnosedafter1yo ==
            0, na.rm = T), " (", round(100 * mean(diagnosedafter1yo == 0, na.rm = T),
            2), ")"), `No of individuals homozygous for F508 (%)` = paste0(sum(f508_homozygous ==
            1, na.rm = T), " (", round(100 * mean(f508_homozygous == 1, na.rm = T),
            2), ")"), `No of individuals with one FEV1 measure (%)` = paste0(sum(no_encounters ==
            1), " (", round(100 * mean(no_encounters == 1), 2), ")"), `FEV1 at first encounter (IQR)` = quantile_interval(first_fev),
        `FEV1 at last encounter (IQR)` = quantile_interval(last_fev), `Number of deaths (%)` = paste0(sum(death_status ==
            1), " (", round(100 * mean(death_status == 1), 2), ")"))





tab1 <- tab1_prep %>%
    group_by(dmg_sex) %>%
    summarise(`Number of individuals` = n_distinct(patient_id), `Median number of encounters per individual (IQR)` = quantile_interval(no_encounters),
        `Difference in years between first and last encounter date (IQR)` = quantile_interval(ageatlastvisit -
            ageatfirstvisit), `Median age at diagnosis in years (IQR)` = quantile_interval(ageatdiagnosis),
        `No of individuals diagnosed within the first year after birth (%)` = paste0(sum(diagnosedafter1yo ==
            0, na.rm = T), " (", round(100 * mean(diagnosedafter1yo == 0, na.rm = T),
            2), ")"), `No of individuals homozygous for F508 (%)` = paste0(sum(f508_homozygous ==
            1, na.rm = T), " (", round(100 * mean(f508_homozygous == 1, na.rm = T),
            2), ")"), `No of individuals with one FEV1 measure (%)` = paste0(sum(no_encounters ==
            1), " (", round(100 * mean(no_encounters == 1), 2), ")"), `FEV1 at first encounter (IQR)` = quantile_interval(first_fev),
        `FEV1 at last encounter (IQR)` = quantile_interval(last_fev), `Number of deaths (%)` = paste0(sum(death_status ==
            1), " (", round(100 * mean(death_status == 1), 2), ")"))




rbind(tab1, tab1_tot) %>%
    t() %>%
    kbl() %>%
    kable_paper("hover", html_font = "helvetica", full_width = F)

dmg_sex	F	M	Total
Number of individuals	3287	3814	7099
Median number of encounters per individual (IQR)	8 (5; 12)	9 (5; 13)	9 (5; 13)
Difference in years between first and last encounter date (IQR)	8 (4.15; 12.62)	8.87 (4.83; 14.06)	8.47 (4.53; 13.46)
Median age at diagnosis in years (IQR)	0.67 (0.12; 4.04)	0.53 (0.12; 3.97)	0.58 (0.12; 3.99)
No of individuals diagnosed within the first year after birth (%)	1807 (54.97)	2190 (57.42)	3997 (56.29)
No of individuals homozygous for F508 (%)	1589 (48.34)	1924 (50.45)	3513 (49.47)
No of individuals with one FEV1 measure (%)	176 (5.35)	191 (5.01)	367 (5.17)
FEV1 at first encounter (IQR)	2.02 (1.41; 2.63)	2.9 (1.98; 3.7)	2.4 (1.64; 3.21)
FEV1 at last encounter (IQR)	1.49 (0.9; 2.28)	2.15 (1.23; 3.23)	1.8 (1.05; 2.78)
Number of deaths (%)	1028 (31.27)	987 (25.88)	2015 (28.38)

MELSM analysis

Basic model (from Hedeker et al., 2008) : the submodel for the outcome mean includes as covariates age and sex with a random slope per subject; the submodel for the exponential of the standard deviation includes a random effect per subject. We also estimate the correlation between group-level effects.

\(y_{ij} = x_{ij}^\mathsf{T}\beta + \upsilon_i + \varepsilon_{ij} \qquad \qquad i = 1,\dots N \text{ subjects, } j = 1,\dots, n_i \text{ visits}\)
\(\varepsilon_{ij}\) independent of \(\upsilon_{i}, \; \forall i\)
between-subject variance: \(\upsilon_i \sim N(0, \sigma_{\upsilon_{i}}^2), \qquad \sigma_{\upsilon_{i}}^2 =\exp{(u_i^\mathsf{T}\alpha)}\)
within-subjects variance: \(\varepsilon_{ij} \sim N(0, \sigma_{\varepsilon_{ij}}^2), \qquad \sigma_{\varepsilon_{ij}}^2 = \exp{(w_{ij}^\mathsf{T}\tau + \omega_i)}\)
random effects are correlated via \(\rho\): \(\begin{pmatrix}\upsilon_i\\\omega_{i}\end{pmatrix} \sim N \begin{bmatrix} \begin{pmatrix}0\\ 0\end{pmatrix}\!\!, \begin{pmatrix} \sigma_{\upsilon}^2 & \rho\sigma_{\upsilon}\sigma_{\omega} \\ & \sigma_{\omega}^2 \end{pmatrix} \end{bmatrix}\)

For the complete model used in this analysis, see the Appendix of the paper.

By default, natural cubic regression spline (bs = ‘cr’) are defined by a modest sized set of knots spread evenly (by quantiles) through the covariate values. They are penalized by the conventional intergrated square second derivative cubic spline penalty (https://stat.ethz.ch/R-manual/R-devel/library/mgcv/html/smooth.terms.html). B-splines family (bs = ‘bs’, bs = ‘ps’, bs = ‘ad’) place knots evenly.

A list of the following objects:

bform: it specifies the formula to be used in brm. Being a distributional model, we specify a model for the outcome gli_fev and one for the parameter sigma. The family of distributions is Gaussian.
prior_fitupdate: it contains the prior specifications for the model.
brms_fitprior: it fits the model specified in bform with the prior specification in prior_fitupdate on the dataset data_fit with 2400 iterations (of which 1200 warm-up). First, only the prior is sampled (in case some prior checks are needed), than the model is updated in brms_fit1. Watanabe AIC and loo can be then added (useful for model comparison).

The factor-smooth interaction in brms, derived from the mgcv package, specifies a different smoothing function for each sex with a centering constraint with respect to the model intercept. Therefore, to account for sex mean differences, a main parametric term for sex is also included ( https://stat.ethz.ch/Rmanual/R-devel/library/mgcv/html/gam.models.html).

###Note: this block takes a long time to run! Do not run unless you need to fit the model. You might also want to store the brms_fit1 model in a .rds file or a workspace.


seed1 <- 787878


bform <- bf(
  gli_fev ~ dmg_sex + s(age, by = dmg_sex, bs = 'cr', k = 5)  
  + diagnosedafter1yo 
  + s(yob, bs = 'cr', k = 5)
  + f508_homozygous 
  + (1 |ID| patient_id),  
  sigma ~ dmg_sex + s(age, by = dmg_sex, bs = 'cr', k = 5)
  + diagnosedafter1yo 
  + s(yob, bs = 'cr', k = 5)
  + f508_homozygous
  + (1 |ID| patient_id), 
  family = gaussian()
)




prior_fitupdate <- c(prior(normal(2, 0.01), class = Intercept),
                                   prior(normal(0, 0.01), class = b),
                                   prior(exponential(10), class = sd),
                                   
                                   prior(normal(0, 0.1), class = Intercept, dpar = sigma),
                                   prior(normal(0, 0.1), class = b, dpar = sigma),
                                   prior(exponential(10), class = sd, dpar = sigma),
                                   
                                   prior(exponential(10), class = sds),
                                   prior(exponential(10), class = sds, dpar = sigma),
                                   prior(lkj(2), class = cor))




brms_fitprior <- brm(bform,
                       data = data_fit,
                       seed = seed1,
                       prior = prior_fitupdate,    
                       warmup = 1200,
                       iter = 2400,
                       thin = 5,
                       chains = 2, cores = 2
                       , sample_prior = "only"
                       , control = list(adapt_delta = 0.95)
                     )


fit1_time <- system.time(
  brms_fit1 <-  update(brms_fitprior, sample_prior="no", cores = 2, seed = seed1)
)    

brms_fit1 <- add_criterion(brms_fit1, c("waic", "loo"), cores = 2)

Model summary

The package brms uses only complete cases, that is, the subjects with no NAs in the covariates selected for the model.

summary(brms_fit1)

##  Family: gaussian 
##   Links: mu = identity; sigma = log 
## Formula: gli_fev ~ dmg_sex + s(age, by = dmg_sex, bs = "cr", k = 5) + diagnosedafter1yo + s(yob, bs = "cr", k = 5) + f508_homozygous + (1 | ID | patient_id) 
##          sigma ~ dmg_sex + s(age, by = dmg_sex, bs = "cr", k = 5) + diagnosedafter1yo + s(yob, bs = "cr", k = 5) + f508_homozygous + (1 | ID | patient_id)
##    Data: data_fit (Number of observations: 65522) 
##   Draws: 2 chains, each with iter = 2400; warmup = 1200; thin = 5;
##          total post-warmup draws = 480
## 
## Smooth Terms: 
##                           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sds(sagedmg_sexF_1)           0.02      0.01     0.01     0.06 1.00      480
## sds(sagedmg_sexM_1)           0.03      0.02     0.01     0.08 1.01      458
## sds(syob_1)                   0.12      0.05     0.05     0.25 1.01      174
## sds(sigma_sagedmg_sexF_1)     0.11      0.04     0.05     0.21 1.00      430
## sds(sigma_sagedmg_sexM_1)     0.10      0.05     0.05     0.22 1.00      463
## sds(sigma_syob_1)             0.12      0.06     0.05     0.27 1.00      406
##                           Tail_ESS
## sds(sagedmg_sexF_1)            494
## sds(sagedmg_sexM_1)            389
## sds(syob_1)                    445
## sds(sigma_sagedmg_sexF_1)      494
## sds(sigma_sagedmg_sexM_1)      477
## sds(sigma_syob_1)              363
## 
## Group-Level Effects: 
## ~patient_id (Number of levels: 7099) 
##                                Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(Intercept)                      1.04      0.01     1.02     1.06 1.05
## sd(sigma_Intercept)                0.45      0.01     0.44     0.46 1.00
## cor(Intercept,sigma_Intercept)     0.16      0.02     0.13     0.20 1.00
##                                Bulk_ESS Tail_ESS
## sd(Intercept)                        73      138
## sd(sigma_Intercept)                 404      494
## cor(Intercept,sigma_Intercept)      399      492
## 
## Population-Level Effects: 
##                          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept                    2.01      0.01     1.99     2.03 1.01       83
## sigma_Intercept             -1.48      0.01    -1.51    -1.46 1.00      336
## dmg_sexM                     0.10      0.01     0.08     0.12 1.02      153
## diagnosedafter1yo1           0.05      0.01     0.03     0.07 1.00      229
## f508_homozygous1            -0.05      0.01    -0.06    -0.03 1.01      148
## sigma_dmg_sexM               0.25      0.01     0.22     0.27 1.00      465
## sigma_diagnosedafter1yo1    -0.04      0.01    -0.07    -0.02 1.00      239
## sigma_f508_homozygous1       0.06      0.01     0.03     0.08 1.00      443
## sage:dmg_sexF_1             -0.27      0.00    -0.27    -0.26 1.00      375
## sage:dmg_sexM_1             -0.29      0.00    -0.30    -0.28 1.00      459
## syob_1                       0.00      0.01    -0.02     0.02 1.00      355
## sigma_sage:dmg_sexF_1       -0.09      0.01    -0.11    -0.06 1.00      435
## sigma_sage:dmg_sexM_1       -0.08      0.01    -0.10    -0.06 1.00      461
## sigma_syob_1                -0.12      0.03    -0.19    -0.06 1.00      449
##                          Tail_ESS
## Intercept                     188
## sigma_Intercept               492
## dmg_sexM                      315
## diagnosedafter1yo1            299
## f508_homozygous1              356
## sigma_dmg_sexM                358
## sigma_diagnosedafter1yo1      419
## sigma_f508_homozygous1        469
## sage:dmg_sexF_1               465
## sage:dmg_sexM_1               453
## syob_1                        358
## sigma_sage:dmg_sexF_1         436
## sigma_sage:dmg_sexM_1         422
## sigma_syob_1                  378
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Fitted values for individuals within the dataset

To derive fitted values, a reference guide is in https://www.andrewheiss.com/blog/2022/09/26/guide-visualizing-types-posteriors/#tldr-diagrams-and-cheat-sheets and https://www.andrewheiss.com/blog/2021/11/10/ame-bayes-re-guide/. In particular, if you use re_formula = NULL in the fitted command, you are including the random effects, while if you use re_formula = NA you are only getting the grand mean (given the covariates). The smooth means of the fitted values with re_formula = NULL and the smooth means of the fitted values with re_formula = NA are exactly the same because the mean of random effects is equal to zero.

fitted_mu <- fitted(brms_fit1, re_formula = NULL, scale  = "response", dpar = "mu") %>%
  as.data.frame()
colnames(fitted_mu) <- paste0("mu_", colnames(fitted_mu))
fitted_sigma <- fitted(brms_fit1, re_formula = NULL, scale  = "response", dpar = "sigma") %>%
  as.data.frame()
colnames(fitted_sigma) <- paste0("sigma_", colnames(fitted_sigma))


explanatory_after <- c("patient_id", "age", "dmg_sex", "diagnosedafter1yo", "dmg_dateofbirth", "f508_homozygous")
dependent <-  "gli_fev"


fittedpars_withcovariates <- data_fit %>% 
  select(all_of(c(explanatory_after, dependent, "dmg_deathhaspatientdied", "Transplanted"))) %>% 
  drop_na() %>% 
  cbind(., fitted_mu, fitted_sigma)

fittedpars_withcovariates_reshaped <- fittedpars_withcovariates  %>%
  gather(dpar, Estimate, c("mu_Estimate", "sigma_Estimate")) %>%
  mutate(ymin = ifelse(dpar == "sigma_Estimate", 0, 0.95),
         ymax = ifelse(dpar == "sigma_Estimate", 0.7, 3.5)) %>%###ymin to make sure that y-axis in sigma subplots includes 0
  select(-ends_with(c("Est.Error", "Q2.5", "Q97.5")))


###Plots
fittedpars_withcovariates_reshaped %>%
  filter(patient_id %in% sample(fittedpars_withcovariates_reshaped$patient_id, 300)
         & dpar == "sigma_Estimate") %>%
  ggplot(aes(x = age, y = Estimate, group = patient_id, color = diagnosedafter1yo)) +
  geom_line(alpha = 0.2) +
  facet_grid( ~ dmg_sex, scales = "free") + 
  geom_smooth(data = filter(fittedpars_withcovariates_reshaped, dpar == "sigma_Estimate"), 
              aes(x = age, y = Estimate, group = diagnosedafter1yo, color = diagnosedafter1yo),
              linewidth = 1.25) +  
  scale_color_manual(values = c("#5F4B8BFF", "#E69A8DFF"))


fittedpars_withcovariates_reshaped$dpar <- factor(fittedpars_withcovariates_reshaped$dpar,
        labels = c(~ list("FEV"[1]~"mean"), ~ list("FEV"[1]~"SD")))

fittedpars_withcovariates_reshaped$dmg_sex <- factor(fittedpars_withcovariates_reshaped$dmg_sex,
        labels = c(~ list("Females"), ~ list("Males")))



fittedpars_withcovariates_reshaped %>%
  ggplot(aes(x = age, y = Estimate)) +
  facet_grid(dpar ~ dmg_sex, scales = "free",  labeller = label_parsed) +
  geom_smooth(aes(x = age, y = Estimate, 
                  group = dmg_deathhaspatientdied, 
                  color = dmg_deathhaspatientdied,
                  fill = dmg_deathhaspatientdied), 
              size = 0, lty = 0, alpha = 0) +
  geom_smooth(size = 1, col = 1, lty = 1) +
  scale_color_manual(values = c( "#E69A8DFF","#5F4B8BFF"), labels = c("", "")) +
  scale_fill_manual(values = c("#E69A8DFF", "#5F4B8BFF"), labels = c("", "")) +
  geom_blank(aes(y = ymin)) + geom_blank(aes(y = ymax)) +
  labs(x = "Age", y = "Average trajectory", fill = "", color = "", title = "")

fittedpars_withcovariates_reshaped %>%
  ggplot(aes(x = age, y = Estimate, 
             group = dmg_deathhaspatientdied, 
             color = dmg_deathhaspatientdied, 
             fill = dmg_deathhaspatientdied,
             lty = dmg_deathhaspatientdied)) +
  facet_grid(dpar ~ dmg_sex, scales = "free",  labeller = label_parsed) + 
  geom_smooth(size = 1) +  
  scale_color_manual(values = c("#445C40", "#E69A8DFF"), labels = c("No", "Yes")) +
  scale_fill_manual(values = c("#445C40", "#E69A8DFF"), labels = c("No", "Yes")) + 
  scale_linetype_manual(values = c("11", "81"), labels = c("No", "Yes")) + 
  geom_blank(aes(y = ymin)) + geom_blank(aes(y = ymax)) +
  xlim(18, 50) +
  labs(x = "Age", y = "Average trajectory", fill = "Died during follow-up", color = "Died during follow-up", lty = "Died during follow-up", title = "")

Predicted values for specific covariate values

As in the previous section, if you specify re_formula = NA you do not include any ID (you are getting predictions for an average individual). If you need conditional effects for existing individuals, then use re_formula = NULL or equivalently re_formula = ~ (1 | patient_id)). If you want a new hypothetical individual, you can use patient_id = NA, allow_new_levels = TRUE, re_formula = ~ (1 | patient_ID) and then you specify sample_new_levels either equal to “uncertainty” or “gaussian” in the prediction function.

yob_levels <- c(1963, 1984, 1993)


epred_byage <- brms_fit1 %>% 
  epred_draws(newdata = expand.grid(dmg_sex = c("F", "M"),
                      age = seq(18,50,by=2),
                      diagnosedafter1yo = c(0,1),
                      yob = yob_levels,
                      f508_homozygous = c(0,1), patient_id = "B155916"),  
              re_formula = NA, dpar = TRUE)


epred_byage_reshaped <- epred_byage  %>%
  gather(dpar, Estimate, c("mu", "sigma")) %>%
    mutate(ymin = ifelse(dpar == "sigma", 0, 1.05),
         ymax = ifelse(dpar == "sigma", 1, 2.75)) ###ymin to make sure that y-axis in sigma subplots includes 0

epred_byage_reshaped$dpar <- factor(epred_byage_reshaped$dpar,
        labels = c(~ list("FEV"[1]~"mean"), ~ list("FEV"[1]~"SD")))

epred_byage_reshaped$dmg_sex <- factor(epred_byage_reshaped$dmg_sex,
        labels = c(~ list("Females"), ~ list("Males")))





epred_byage_reshaped %>% filter(diagnosedafter1yo == 0, f508_homozygous == 1) %>%
  ggplot(aes(x = age, y = Estimate, 
             color = rev(factor(yob)), 
             fill = rev(factor(yob)),
             lty = rev(factor(yob)))) +
  facet_grid(dpar ~ dmg_sex, scales = "free",  labeller = label_parsed) + 
  geom_ribbon(aes(ymin = ymin, ymax = ymax), alpha = 0, colour = NA) + ###to get same y-axis across plots
  stat_lineribbon(.width = 0.95, alpha = 1/4, show.legend = TRUE) +
  scale_color_manual(values = c( "#445C40","#3E282B","#EF9806"), 
                     labels = rev(yob_levels), 
                     guide = guide_legend(reverse = TRUE)) +
  scale_fill_manual(values = c("#445C40","#3E282B","#EF9806"), 
                    labels = rev(yob_levels),
                    guide = guide_legend(reverse = TRUE)) +
  scale_linetype_manual(values = c("11", "81", "solid"), 
                        labels = rev(yob_levels),
                        guide = guide_legend(reverse = TRUE)) +
  labs(x = "Age", y = "Prediction", linetype = "Year of birth", 
       fill = "Year of birth", color = "Year of birth", 
       title = "Homozygous F508, diagnosed before 1 year old") +
  theme(plot.title=element_text(size=12))



epred_byage_reshaped %>% filter(diagnosedafter1yo == 0, yob == yob_levels[2]) %>%
    ggplot(aes(x = age, y = Estimate, 
               color = factor(f508_homozygous), 
               fill = factor(f508_homozygous), 
               lty = factor(f508_homozygous)))  +
    facet_grid(dpar ~ dmg_sex, scales = "free",  labeller = label_parsed) + 
    geom_ribbon(aes(ymin = ymin, ymax = ymax), alpha = 0, colour = NA) +
    stat_lineribbon(.width = 0.95, alpha = 1/3, show.legend = TRUE) +
    scale_color_manual(values = c("#3E282B","#EF9806"), labels = c("No", "Yes")) +
    scale_fill_manual(values = c("#3E282B","#EF9806"), labels = c("No", "Yes")) + 
    scale_linetype_manual(values = c("11", "81"), labels = c("No", "Yes")) + 
    labs(x = "Age", y = "Prediction", linetype = "Homozygous F508", 
         fill = "Homozygous F508", color = "Homozygous F508", 
         title = paste0("Born in ", yob_levels[2], ", diagnosed before 1 year old")) +
    theme(plot.title=element_text(size=12))



epred_byage_reshaped %>% filter(f508_homozygous == 1, yob == yob_levels[2]) %>%
    ggplot(aes(x = age, y = Estimate, 
               color = factor(diagnosedafter1yo), 
               fill = factor(diagnosedafter1yo), 
               lty = factor(diagnosedafter1yo)))  +
    facet_grid(dpar ~ dmg_sex, scales = "free",  labeller = label_parsed) + 
    geom_ribbon(aes(ymin = ymin, ymax = ymax), alpha = 0, colour = NA) +
    stat_lineribbon(.width = 0.95, alpha = 1/3, show.legend = TRUE) +
    scale_color_manual(values = c("#3E282B","#EF9806"), labels = c("No", "Yes")) +
    scale_fill_manual(values = c("#3E282B","#EF9806"), labels = c("No", "Yes")) + 
    scale_linetype_manual(values = c("11", "81"), labels = c("No", "Yes")) + 
    labs(x = "Age", y = "Prediction", linetype = "Diagnosed after 1 year old", 
         fill = "Diagnosed after 1 year old", color = "Diagnosed after 1 year old", 
         title = paste0("Born in ", yob_levels[2], ", homozygous F508")) +
    theme(plot.title=element_text(size=12))

Estimated coefficients

cond_smooth_data <- lapply(conditional_smooths(brms_fit1, method = "fitted"), as.data.frame) %>%
    bind_rows(.id = "coef_function") %>%
    mutate(submodel = factor(str_extract(coef_function, "[^: ]+"), labels = c("Location submodel",
        "Scale submodel")), xvar = factor(sapply(str_extract_all(coef_function, "[^(*),]+"),
        "[[", 2)), byvar = str_match(coef_function, "by=\\s*(.*?)\\s*,")[, 2])

sexmaineffect <- summary(brms_fit1)$fixed[c("dmg_sexM", "sigma_dmg_sexM"), 1]

cond_smooth_data_WITHINTERACTION <- cond_smooth_data %>%
    mutate_at(vars("estimate__", "lower__", "upper__"), funs(case_when(dmg_sex ==
        "M" & submodel == "Location submodel" ~ . + sexmaineffect[1], dmg_sex ==
        "M" & submodel == "Scale submodel" ~ . + sexmaineffect[2], TRUE ~ .)))



cond_smooth_data_WITHINTERACTION %>%
    ggplot(., aes(x = effect1__, fill = dmg_sex)) + geom_line(aes(y = estimate__,
    color = dmg_sex, linetype = dmg_sex)) + facet_grid(submodel ~ xvar, scales = "free",
    switch = "y", labeller = labeller(xvar = c(age = "Age", yob = "Year of birth"),
        submodel = c(`Location submodel` = "Mean trajectory", `Scale submodel` = "Variability (log SD)"))) +
    geom_ribbon(aes(ymin = lower__, ymax = upper__), alpha = 0.3, colour = NA) +
    labs(x = "", y = bquote("Association with FEV"[1]), fill = "Sex", color = "Sex",
        linetype = "Sex") + theme(plot.margin = margin(t = 2, r = 10, b = 2, l = 2,
    unit = "pt"), legend.position = "bottom", legend.margin = margin(0, 0, 0, 0,
    unit = "pt"), legend.box.margin = margin(0, 0, 0, 0, unit = "pt"), panel.grid.minor = element_line(),
    panel.spacing.y = unit(0.5, "lines"), axis.text = element_text(size = 10)) +
    scale_color_manual(breaks = c("F", "M"), values = c("#F8766D", "#001D81")) +
    scale_fill_manual(breaks = c("F", "M"), values = c("#F8766D", "#001D81")) + scale_linetype_manual(breaks = c("F",
    "M"), values = c("81", "11"), na.value = "solid")

Random effects

Let us look first at the standard deviation of the scale random effect. The probability that this parameter is higher than zero is very close to 1.

hyp <- c("sd_patient_id__sigma_Intercept^2 = 0")
hypothesis(brms_fit1, hyp, class = NULL)

## Hypothesis Tests for class :
##                 Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio
## 1 (sd_patient_id__s... = 0      0.2         0     0.19     0.21         NA
##   Post.Prob Star
## 1        NA    *
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.

mcmc_dens(brms_fit1, pars = "sd_patient_id__sigma_Intercept") + labs(x = expression(sigma[omega]))

Bonus point: a subject specific random intercept is a category smoother (check https://www.tjmahr.com/random-effects-penalized-splines-same-thing/ to understand the equivalence between splines and random effects).

Now we create the dataset ranef_df joining demographic information and random effects.

visitcount <- data_fit %>%
    group_by(patient_id) %>%
    summarise(dmg_sex, no_visits = n(), ageatfirstvisit = first(age), ageatdiagnosis = first(dmg_ageatdiagnosis),
        death_status = !is.na(dmg_dateofdeath)) %>%
    distinct(across(everything()))



ranef_df0 <- ranef(brms_fit1, groups = "patient_id", probs = c(0.025, 0.975)) %>%
    data.frame()
names(ranef_df0) <- str_remove(names(ranef_df0), "patient_id.")


ranef_df <- tibble::rownames_to_column(ranef_df0, var = "patient_id") %>%
    left_join(., visitcount, by = "patient_id")

repeated_id <- ranef_df %>%
    count(patient_id) %>%
    filter(n > 1) %>%
    pull(patient_id)  ##appear twice in ranef_df ###recorded as both males and females

ranef_df <- filter(ranef_df, patient_id %ni% repeated_id)

ranef_df <- ranef_df %>%
    mutate(sub_muintercept = Estimate.Intercept + summary(brms_fit1)$fixed["Intercept",
        1], sub_sigmaintercept = exp(Estimate.sigma_Intercept + summary(brms_fit1)$fixed["sigma_Intercept",
        1]))

ggplot(data = ranef_df[1:500, ], aes(x = Estimate.Intercept, y = Estimate.sigma_Intercept)) +
    geom_linerange(aes(xmin = Q2.5.Intercept, xmax = Q97.5.Intercept), alpha = 0.1) +
    geom_linerange(aes(ymin = Q2.5.sigma_Intercept, ymax = Q97.5.sigma_Intercept),
        alpha = 0.1) + geom_point(aes(color = dmg_sex), alpha = 0.4, size = 0.9) +
    geom_smooth(data = ranef_df, aes(x = Estimate.Intercept, y = Estimate.sigma_Intercept),
        method = "lm", se = TRUE, col = 1) + geom_smooth(data = ranef_df, aes(x = Estimate.Intercept,
    y = Estimate.sigma_Intercept, color = dmg_sex, fill = dmg_sex, linetype = dmg_sex),
    method = "gam", formula = y ~ s(x, bs = "cs", k = 5), size = 1.25) + scale_color_manual(breaks = c("F",
    "M"), values = c("#F8766D", "#001D81")) + scale_fill_manual(breaks = c("F", "M"),
    values = c("#F8766D", "#001D81")) + scale_linetype_manual(breaks = c("F", "M"),
    values = c("longdash", "solid"), na.value = "solid") + labs(x = "Location random effects",
    y = "Scale random effects", color = "Sex", fill = "Sex", linetype = "Sex") +
    theme(panel.grid.major = element_blank())

ggplot(data = ranef_df, aes(x = Estimate.sigma_Intercept, y = factor(no_visits),
    fill = stat(x))) + geom_density_ridges_gradient(alpha = 0.6, rel_min_height = 0.01) +
    theme_ridges() + labs(x = "Scale random effect", y = "Number of encounters") +
    scale_fill_viridis_c(option = "C") + theme(legend.position = "none")

Posterior checks

If you want to check autocorrelation of the MCMC samples, you can run the following code.

mcmc_plot(brms_fit1, type = "acf") + theme(strip.text.x = element_text(size = 14,
    angle = 90))  ###you would like to see L-shaped plots

The model fit took approximately 20.44 hours.

Posterior predictive distribution:

\[p(\tilde{y}| y ) = \int p(\tilde{y}|\theta)p(\theta|y)d\theta\]

pp_check(brms_fit1) + theme(legend.position = "bottom")  ##density overlay
pp_check(brms_fit1, type = "ecdf_overlay")

For multiple models, model comparison is performed in terms of best predictive ability, via either Bayesian LOO-CV and Watanabe AIC (WAIC, asymptotically equivalent to LOO-CV). The best model is the one with lower LOO-CV. Larger ELPD values indicate better fit.

Session info

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252 
## [2] LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] emmeans_1.8.2     survival_3.4-0    tidybayes_3.0.4   bayesplot_1.10.0 
##  [5] posterior_1.4.1   brmsmargins_0.2.0 brms_2.19.0       Rcpp_1.0.8.3     
##  [9] rsvg_2.3.2        magrittr_2.0.3    DiagrammeRsvg_0.1 DiagrammeR_1.0.9 
## [13] pipeR_0.6.1.3     ggridges_0.5.4    ggpubr_0.6.0      stringr_1.5.0    
## [17] kableExtra_1.3.4  knitr_1.42        lubridate_1.8.0   ggplot2_3.4.2    
## [21] tidyr_1.3.0       dplyr_1.1.2       pacman_0.5.1     
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.4.1      systemfonts_1.0.4    plyr_1.8.7          
##   [4] igraph_1.3.1         splines_4.1.2        svUnit_1.0.6        
##   [7] crosstalk_1.2.0      rstantools_2.2.0     inline_0.3.19       
##  [10] digest_0.6.29        htmltools_0.5.2      fansi_1.0.3         
##  [13] checkmate_2.1.0      extraoperators_0.1.1 RcppParallel_5.1.5  
##  [16] matrixStats_0.62.0   xts_0.12.1           svglite_2.1.0       
##  [19] prettyunits_1.1.1    colorspace_2.0-3     rvest_1.0.3         
##  [22] ggdist_3.1.1         xfun_0.39            callr_3.7.0         
##  [25] crayon_1.5.1         jsonlite_1.8.0       zoo_1.8-10          
##  [28] glue_1.6.2           gtable_0.3.0         webshot_0.5.4       
##  [31] V8_4.2.0             distributional_0.3.0 car_3.1-0           
##  [34] pkgbuild_1.3.1       rstan_2.26.13        abind_1.4-5         
##  [37] scales_1.2.0         mvtnorm_1.1-3        rstatix_0.7.2       
##  [40] miniUI_0.1.1.1       viridisLite_0.4.0    xtable_1.8-4        
##  [43] finalfit_1.0.4       stats4_4.1.2         StanHeaders_2.26.13 
##  [46] DT_0.22              datawizard_0.6.3     htmlwidgets_1.5.4   
##  [49] httr_1.4.2           threejs_0.3.3        arrayhelpers_1.1-0  
##  [52] RColorBrewer_1.1-3   ellipsis_0.3.2       mice_3.14.0         
##  [55] pkgconfig_2.0.3      loo_2.5.1            farver_2.1.0        
##  [58] sass_0.4.0           utf8_1.2.2           labeling_0.4.2      
##  [61] tidyselect_1.2.0     rlang_1.1.0          reshape2_1.4.4      
##  [64] later_1.3.0          munsell_0.5.0        tools_4.1.2         
##  [67] visNetwork_2.1.2     cli_3.4.1            generics_0.1.2      
##  [70] broom_1.0.1          evaluate_0.15        fastmap_1.1.0       
##  [73] yaml_2.3.5           processx_3.5.3       purrr_1.0.1         
##  [76] nlme_3.1-153         mime_0.12            formatR_1.12        
##  [79] xml2_1.3.3           compiler_4.1.2       shinythemes_1.2.0   
##  [82] rstudioapi_0.13      curl_4.3.2           ggsignif_0.6.3      
##  [85] tibble_3.2.1         bslib_0.3.1          stringi_1.7.6       
##  [88] highr_0.9            ps_1.6.0             Brobdingnag_1.2-7   
##  [91] forcats_0.5.1        lattice_0.20-45      Matrix_1.3-4        
##  [94] markdown_1.1         shinyjs_2.1.0        tensorA_0.36.2      
##  [97] vctrs_0.6.1          pillar_1.9.0         lifecycle_1.0.3     
## [100] jquerylib_0.1.4      bridgesampling_1.1-2 estimability_1.4.1  
## [103] cowplot_1.1.1        data.table_1.14.2    insight_0.18.6      
## [106] httpuv_1.6.5         R6_2.5.1             promises_1.2.0.1    
## [109] gridExtra_2.3        codetools_0.2-18     boot_1.3-28         
## [112] colourpicker_1.1.1   gtools_3.9.2         withr_2.5.0         
## [115] shinystan_2.6.0      mgcv_1.8-38          bayestestR_0.13.0   
## [118] parallel_4.1.2       grid_4.1.2           coda_0.19-4         
## [121] rmarkdown_2.13       carData_3.0-5        shiny_1.7.1         
## [124] base64enc_0.1-3      dygraphs_1.1.1.6