This is the code for the analysis of the UK Cystic Fibrosis Registry data in the paper "A Bayesian location-scale joint model for time-to-event and multivariate longitudinal data with association based on within-individual variability" by Palma et al. (2026).
library(pacman)
p_load(rstan, brms)
p_load(ggplot2, ggpubr, ggridges, ggsurvfit, dplyr, tidyr, pipeR, magrittr, gridExtra)
p_load(stringr, survival, lubridate, DiagrammeR)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
# library(devtools) install_github('marcopalma3/rstanjmwiv')
library(rstanjmwiv)
theme_set(theme_bw(base_size = 14) + theme(panel.grid.minor = element_blank(),
plot.margin = margin(t = 6, r = 6, b = 6, l = 6, unit = "pt"),
legend.position = "bottom", legend.margin = margin(0, 0, 0, 0, unit = "pt"),
legend.box.margin = margin(0, 0, 0, 0, unit = "pt")))
"%ni%" <- Negate("%in%")
options(digits = 3)
var_long <- c("gli_fev", "bmi")
var_long_labels <- c("FEV1", "BMI")
Import the dataset and store it in the cfdata object.
cfdata_clean <- cfdata %>%
mutate(s01encounterdate = lubridate::dmy(s01encounterdate),
dmg_dateofdeath = lubridate::dmy(dmg_dateofdeath),
dmg_dateofbirth = lubridate::dmy(dmg_dateofbirth)) %>%
mutate_at(c("patient_id","sitecode_anon","dmg_sex","dmg_ethnicity_gli",
"s01isthisanannualreviewencounter","s05culturespeciesstaph",
"s05culturespeciespseudoaeruginos","s07pancreaticenzymesupplements"),
list(~factor(.)))
rm(cfdata)
The number of rows is 166117 and the number of variables is 1357. The number of unique patient IDs is 13899.
We have kept only the first visit per year per each patient. To account for all transplants,
you need to first compute transplanted indicators, then remove missing values (often, in the year
of transplant gli_fev is missing).
explanatory_before <- c("patient_id","age","dmg_sex","dmg_dateofbirth","f508_class","dmg_ageatdiagnosis")
dependent <- var_long
cf2 <- cfdata_clean %>>%
(~ c(nrow(.), length(unique(.$patient_id))) -> a1) %>>%
filter(age >= 18 & age <= 50) %>>%
(~ c(nrow(.), length(unique(.$patient_id))) -> a2) %>>%
group_by(patient_id) %>% filter(year != lag(year) | row_number() == 1) %>>%
(~ c(nrow(.), length(unique(.$patient_id))) -> a3) %>>%
group_by(patient_id) %>%
mutate(Transplanted = cumsum(s10transplantsincelastannualrevi == "Y"),
EverTx = as.integer(max(Transplanted) > 0)) %>%
filter(Transplanted == 0) %>%
mutate(Age_LastEncounter = max(age),
Date_LastEncounter = max(s01encounterdate)) %>>%
(~ c(nrow(.), length(unique(.$patient_id))) -> a4)
cf2_bypatient <- cf2 %>%
select(patient_id, dmg_dateofbirth, dmg_sex, dmg_ageatdiagnosis, f508_class) %>%
distinct(across(everything())) %>%
drop_na() %>%
mutate("diagnosedafter1yo" = factor(ifelse(dmg_ageatdiagnosis > 1, 1, 0)),
"f508_homozygous" = factor(ifelse(f508_class == "Homoz", 1, 0)),
"yob" = year(dmg_dateofbirth))
cf2 <- left_join(cf2,
select(cf2_bypatient, patient_id, diagnosedafter1yo, f508_homozygous, yob),
by = "patient_id")
explanatory_after <- c("patient_id","age","dmg_sex","diagnosedafter1yo","f508_homozygous","yob")
cfLONG <- cf2 %>%
filter(!!sym(var_long[1]) > 0) %>%
filter(!!sym(var_long[2]) > 0 & !!sym(var_long[2]) < 80) %>%
drop_na(all_of(explanatory_after)) %>>%
(~ c(nrow(.), length(unique(.$patient_id))) -> a5)
cfLONG$patient_id <- droplevels(cfLONG$patient_id)
cfLONG <- ungroup(cfLONG)
Please note, for some people the date of death is recorded, but not the age at death.
Event time is equal to:
Death status is equal to 1 if the person died and never received transplant, 0 otherwise (censored).
last_deathdate_recorded <- max(cf2$dmg_dateofdeath, na.rm = TRUE)
cfSURV <- cf2 %>%
group_by(patient_id) %>%
filter(row_number() == 1) %>%
select(patient_id, dmg_sex, dmg_dateofbirth, dmg_dateofdeath, dmg_ageatdeath,
diagnosedafter1yo, f508_homozygous, yob, EverTx, Date_LastEncounter) %>%
mutate(event_time = case_when(
!is.na(dmg_dateofdeath) && EverTx == 0 ~ dmg_dateofdeath,
EverTx == 1 ~ Date_LastEncounter,
.default = last_deathdate_recorded),
death_status = ifelse(!is.na(dmg_dateofdeath) && EverTx == 0, 1, 0),
ageatevent = max(18, as.numeric(interval(dmg_dateofbirth, event_time), "years"))) %>%
filter(patient_id %in% unique(cfLONG$patient_id)) %>%
ungroup()
female_subsample <- cfLONG %>%
filter(dmg_sex == "F") %>%
pull(patient_id) %>%
unique()
cfLONG_female <- cfLONG %>%
filter(patient_id %in% female_subsample) %>%
mutate(patient_id = droplevels(patient_id),
id = as.integer(patient_id),
age_minus18 = age - 18,
bmi10 = bmi / 10) %>%
ungroup()
cfLONG_female %>%
group_by(patient_id) %>%
summarise(no_encounters = n()) %>%
pull(no_encounters) %>%
quantile(., c(0.25, 0.5, 0.75))
25% 50% 75%
4 6 10
cat("The number of individuals in the dataset is", nrow(cfSURV_female),
", for a total number of annual visits of", nrow(cfLONG_female))
The number of individuals in the dataset is 3012, for a total number of annual visits of 21910
cfSURV_female <- cfSURV %>%
filter(patient_id %in% female_subsample) %>%
mutate(patient_id = droplevels(patient_id),
id = as.integer(patient_id),
yob80 = yob - 1980,
AC50_ageatevent_minus18 = ifelse(ageatevent > 50, 50 - 18, ageatevent - 18),
AC50_ageatevent_minus18 = ifelse(AC50_ageatevent_minus18 > 0, AC50_ageatevent_minus18, 0),
AC50_death_status = ifelse(ageatevent > 50 & death_status == 1, 0, death_status)) %>%
ungroup()
cfSURV_female %>%
survfit2(Surv(AC50_ageatevent_minus18, AC50_death_status) ~ diagnosedafter1yo, data = .) %>%
ggsurvfit::ggsurvfit() + add_confidence_interval() + add_risktable() +
ylim(c(0,1)) + scale_x_continuous(breaks = seq(0,32,4)) +
labs(x = "years", y = "Overall survival probability")
cfSURV_female %>%
survfit2(Surv(AC50_ageatevent_minus18, AC50_death_status) ~ f508_homozygous, data = .) %>%
ggsurvfit::ggsurvfit() + add_confidence_interval() + add_risktable() +
ylim(c(0,1)) + scale_x_continuous(breaks = seq(0,32,4)) +
labs(x = "years", y = "Overall survival probability")
cfSURV_female %>%
survfit2(Surv(AC50_ageatevent_minus18, AC50_death_status) ~ f508_homozygous, data = .) %>%
ggsurvfit::ggsurvfit() + add_confidence_interval() + add_risktable() +
ylim(c(0,1)) + scale_x_continuous(breaks = seq(0,32,4)) +
labs(x = "years", y = "Overall survival probability")
f_fev <- list(formula(gli_fev ~ age_minus18 + diagnosedafter1yo + f508_homozygous + (1 | id)),
formula(sigma ~ age_minus18 + diagnosedafter1yo + f508_homozygous + (1 | id)))
f_bmi <- list(formula(bmi ~ age_minus18 + diagnosedafter1yo + f508_homozygous + (1 | id)),
formula(sigma ~ age_minus18 + diagnosedafter1yo + f508_homozygous + (1 | id)))
f_Event <- survival::Surv(AC50_ageatevent_minus18, AC50_death_status) ~
diagnosedafter1yo + f508_homozygous
prior_input <- list(
# y1mu_prior_mean = NULL, y2mu_prior_mean = NULL,
# y1sigma_prior_mean = NULL, y2sigma_prior_mean = NULL,
# e_prior_mean = NULL, a_prior_mean = rep(0, 4),
# ymu_prior_mean_for_intercept = NULL,
# ysigma_prior_mean_for_intercept = NULL,
# e_prior_mean_for_aux = NULL,
# y1mu_prior_scale = 5, y2mu_prior_scale = 5,
# y1sigma_prior_scale = 5, y2sigma_prior_scale = 5,
# e_prior_scale = 5, a_prior_scale = rep(2.5, 4),
# ymu_prior_scale_for_intercept = rep(0.5, 2),
# ysigma_prior_scale_for_intercept = rep(0.5, 2),
e_prior_scale_for_aux = rep(5, 6),
b_prior_scale = rep(1, 4),
b_prior_df = rep(1, 4),
b_prior_regularization = 5
)
standata_jmwiv_CFfemale <- make_standata_jmwiv(
formulaLong1 = f_fev,
formulaLong2 = f_bmi,
dataLong = cfLONG_female,
id_var = "id",
time_varLong = "age_minus18",
formulaEvent = f_Event,
dataEvent = cfSURV_female,
a_K = 4,
assoc = "CV",
basehaz = "bs",
basehaz_aux = list(df = 6, knots = NULL, degree = 3),
qnodes = 15L,
prior_list = prior_input)
stanc_mod <- stanc(here::here("STAN_functions/jm-wiv.stan"))
mod <- stan_model(stanc_ret = stanc_mod, model_name = "jmwiv_simple", verbose = FALSE)
CFfemale_jmwiv_CVassoc <- sampling(object = mod, data = standata_jmwiv_CFfemale,
cores = 2, chains = 2, warmup = 1000, iter = 2000, refresh = 0.2, seed = 856,
control = list(max_treedepth = 15))
Alternatively, the last two chunks can be run using rstanjmwiv::jmwiv_stan() — but this will not save the standata object.
CFfemale_jmwiv_CVassoc <- jmwiv_stan(
formulaLong1 = f_fev, formulaLong2 = f_bmi,
dataLong = cfLONG_female, id_var = "id", time_varLong = "age_minus18",
formulaEvent = f_Event, dataEvent = cfSURV_female,
a_K = 4, assoc = "CV", basehaz = "bs",
basehaz_aux = list(df = 6, knots = NULL, degree = 3),
qnodes = 15L, prior_list = prior_input,
cores = 2, chains = 2, warmup = 1000, iter = 2000, seed = 856,
init_r = 1, control = list(max_treedepth = 15))
print(get_elapsed_time(CFfemale_jmwiv_CVassoc))
warmup sample
chain:1 35433.2 17009.6
chain:2 35170.3 15348.1
print(CFfemale_jmwiv_CVassoc, pars = c("y1mu_gamma", "y1sigma_gamma", "y2mu_gamma",
"y2sigma_gamma", "y1mu_beta", "y1sigma_beta", "y2mu_beta", "y2sigma_beta"),
probs = c(0.025, 0.975))
Inference for Stan model: jmwiv.
2 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=2000.
mean se_mean sd 2.5% 97.5% n_eff Rhat
y1mu_gamma 1.83 0.00 0.01 1.81 1.86 43 1.03
y1sigma_gamma -1.55 0.00 0.01 -1.57 -1.53 744 1.00
y2mu_gamma 21.54 0.01 0.06 21.42 21.66 138 1.00
y2sigma_gamma 0.16 0.00 0.01 0.14 0.18 433 1.01
y1mu_beta[1] -0.04 0.00 0.00 -0.04 -0.04 800 1.00
y1mu_beta[2] 0.30 0.01 0.03 0.23 0.37 25 1.08
y1mu_beta[3] -0.23 0.00 0.03 -0.29 -0.17 57 1.02
y1sigma_beta[1] -0.02 0.00 0.00 -0.02 -0.02 1136 1.00
y1sigma_beta[2] 0.02 0.00 0.02 -0.03 0.06 1008 1.00
y1sigma_beta[3] 0.05 0.00 0.02 0.00 0.09 827 1.00
y2mu_beta[1] 0.05 0.00 0.00 0.04 0.05 1349 1.00
y2mu_beta[2] 0.87 0.02 0.13 0.61 1.14 64 1.03
y2mu_beta[3] -0.89 0.01 0.12 -1.15 -0.67 105 1.01
y2sigma_beta[1] 0.00 0.00 0.00 0.00 0.00 1930 1.00
y2sigma_beta[2] 0.02 0.00 0.02 -0.03 0.06 758 1.00
y2sigma_beta[3] -0.08 0.00 0.02 -0.12 -0.03 583 1.01
Samples were drawn using NUTS(diag_e) at Sun Jan 18 10:34:54 2026.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
The reconstructed intercepts are:
print(CFfemale_jmwiv_CVassoc, pars = c("y1mu_Intercept", "y1sigma_Intercept",
"y2mu_Intercept", "y2sigma_Intercept"), probs = c(0.025, 0.975))
Inference for Stan model: jmwiv.
1 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=1000.
mean se_mean sd 2.5% 97.5% n_eff Rhat
y1mu_Intercept 2.19 0.01 0.03 2.13 2.23 21 1.01
y1sigma_Intercept -1.39 0.00 0.02 -1.43 -1.35 564 1.00
y2mu_Intercept 21.14 0.01 0.10 20.96 21.33 91 1.00
y2sigma_Intercept 0.19 0.00 0.02 0.15 0.24 433 1.00
Samples were drawn using NUTS(diag_e) at Sun Jan 18 02:58:17 2026.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
Here are the association terms:
print(CFfemale_jmwiv_CVassoc, pars = "a_beta", probs = c(0.025, 0.975))
Inference for Stan model: jmwiv.
2 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=2000.
mean se_mean sd 2.5% 97.5% n_eff Rhat
a_beta[1] -2.14 0.00 0.12 -2.39 -1.91 1408 1
a_beta[2] 2.69 0.02 0.54 1.61 3.77 884 1
a_beta[3] -0.15 0.00 0.03 -0.21 -0.09 644 1
a_beta[4] 0.38 0.01 0.13 0.10 0.63 704 1
Samples were drawn using NUTS(diag_e) at Sun Jan 18 10:34:54 2026.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
print(CFfemale_jmwiv_CVassoc, pars = c("e_Intercept", "e_beta"), probs = c(0.025, 0.975))
Inference for Stan model: jmwiv.
2 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=2000.
mean se_mean sd 2.5% 97.5% n_eff Rhat
e_Intercept 0.07 0 0.06 -0.05 0.19 3611 1
e_beta[1] -0.11 0 0.09 -0.29 0.06 3352 1
e_beta[2] -0.04 0 0.09 -0.22 0.12 3549 1
Samples were drawn using NUTS(diag_e) at Sun Jan 18 10:34:54 2026.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
print(CFfemale_jmwiv_CVassoc, pars = "b_sd", probs = c(0.025, 0.975))
Inference for Stan model: jmwiv.
2 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=2000.
mean se_mean sd 2.5% 97.5% n_eff Rhat
b_sd[1] 0.78 0 0.01 0.76 0.80 118 1.01
b_sd[2] 0.45 0 0.01 0.43 0.47 838 1.00
b_sd[3] 3.24 0 0.05 3.15 3.34 254 1.00
b_sd[4] 0.46 0 0.01 0.44 0.48 751 1.01
Samples were drawn using NUTS(diag_e) at Sun Jan 18 10:34:54 2026.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
summary(CFfemale_jmwiv_CVassoc)$summary[c("a_beta[1]", "a_beta[2]", "a_beta[3]",
"a_beta[4]"), c(1, 4, 8)] %>%
multiply_by(c(1/10, 1/10, 1, 1)) %>%
exp()
mean 2.5% 97.5%
a_beta[1] 0.8073329 0.7876352 0.8264469
a_beta[2] 1.3084585 1.1741484 1.4572759
a_beta[3] 0.8639766 0.8136306 0.9167155
a_beta[4] 1.4649302 1.1090392 1.8688453
ranef_corr_jmwiv(CFfemale_jmwiv_CVassoc)
[,1] [,2] [,3] [,4]
[1,] 1.000000000 0.1439591359 0.4678737197 -0.002110608
[2,] 0.143959136 1.0000000000 0.0004370978 0.254622966
[3,] 0.467873720 0.0004370978 1.0000000000 0.485507917
[4,] -0.002110608 0.2546229657 0.4855079166 1.000000000
The posterior predictive checks for the longitudinal submodel are useful to assess the general fit.
list_of_draws <- rstan::extract(CFfemale_jmwiv_CVassoc)
ppdata <- replicate(10, ppcheck(list_of_draws = list_of_draws, data = cfLONG_female,
x = "age_minus18", idvar = "id"), simplify = FALSE)
library(bayesplot)
y1_rep <- sapply(ppdata, "[[", 1) %>% t()
y2_rep <- sapply(ppdata, "[[", 2) %>% t()
ppc_dens_overlay(cfLONG_female$gli_fev, y1_rep)
ppc_dens_overlay(cfLONG_female$bmi, y2_rep)
The function below returns the predictions for two randomly selected women.
pred_plot_jmwiv(stanfit = CFfemale_jmwiv_CVassoc, standata = standata_jmwiv_CFfemale,
var_labels = c(var_long_labels, "Survival probability"), id_selected = id_sel,
plot_jm = FALSE) + labs(x = "Age") +
scale_x_continuous(breaks = seq(0, 32, by = 5), labels = 18 + seq(0, 32, by = 5)) +
theme(strip.text.x = element_blank())
sessionInfo()
R version 4.3.3 (2024-02-29 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows Server 2022 x64 (build 20348)
locale:
[1] LC_COLLATE=English_United Kingdom.utf8
[2] LC_CTYPE=English_United Kingdom.utf8
[3] LC_MONETARY=English_United Kingdom.utf8
[4] LC_NUMERIC=C
[5] LC_TIME=English_United Kingdom.utf8
time zone: Europe/London
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bayesplot_1.11.1 DiagrammeR_1.0.11 lubridate_1.9.3
[4] survival_3.5-8 stringr_1.5.1 gridExtra_2.3
[7] magrittr_2.0.3 pipeR_0.6.1.3 tidyr_1.3.1
[10] dplyr_1.1.4 ggsurvfit_1.0.0 ggridges_0.5.6
[13] ggpubr_0.6.0 ggplot2_3.5.0 brms_2.20.4
[16] Rcpp_1.0.12 rstan_2.32.5 StanHeaders_2.32.5
[19] pacman_0.5.1 rstanjmwiv_1.0.0.0000