Analysis on the UK CF Registry

A Bayesian location-scale joint model for time-to-event and multivariate longitudinal data with association based on within-individual variability
Marco Palma, Omar El Makkaoui
May 2026

This is the code for the analysis of the UK Cystic Fibrosis Registry data in the paper "A Bayesian location-scale joint model for time-to-event and multivariate longitudinal data with association based on within-individual variability" by Palma et al. (2026).

Introduction

R packages

library(pacman)
p_load(rstan, brms)
p_load(ggplot2, ggpubr, ggridges, ggsurvfit, dplyr, tidyr, pipeR, magrittr, gridExtra)
p_load(stringr, survival, lubridate, DiagrammeR)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

# library(devtools) install_github('marcopalma3/rstanjmwiv')
library(rstanjmwiv)
theme_set(theme_bw(base_size = 14) + theme(panel.grid.minor = element_blank(),
    plot.margin = margin(t = 6, r = 6, b = 6, l = 6, unit = "pt"),
    legend.position = "bottom", legend.margin = margin(0, 0, 0, 0, unit = "pt"),
    legend.box.margin = margin(0, 0, 0, 0, unit = "pt")))

"%ni%" <- Negate("%in%")
options(digits = 3)

var_long       <- c("gli_fev", "bmi")
var_long_labels <- c("FEV1", "BMI")

Dataset

Import the dataset and store it in the cfdata object.

cfdata_clean <- cfdata %>%
    mutate(s01encounterdate = lubridate::dmy(s01encounterdate),
           dmg_dateofdeath  = lubridate::dmy(dmg_dateofdeath),
           dmg_dateofbirth  = lubridate::dmy(dmg_dateofbirth)) %>%
    mutate_at(c("patient_id","sitecode_anon","dmg_sex","dmg_ethnicity_gli",
                "s01isthisanannualreviewencounter","s05culturespeciesstaph",
                "s05culturespeciespseudoaeruginos","s07pancreaticenzymesupplements"),
              list(~factor(.)))

rm(cfdata)

The number of rows is 166117 and the number of variables is 1357. The number of unique patient IDs is 13899.

Subset used for the analysis — inclusion criteria

We have kept only the first visit per year per each patient. To account for all transplants, you need to first compute transplanted indicators, then remove missing values (often, in the year of transplant gli_fev is missing).

  • Filter longitudinal observations between the age of 18 and 50;
  • Keep only the first observation in each year (in case they have more than one annual review);
  • Compute transplant number and keep only the longitudinal observations before the first transplant.
explanatory_before <- c("patient_id","age","dmg_sex","dmg_dateofbirth","f508_class","dmg_ageatdiagnosis")
dependent <- var_long

cf2 <- cfdata_clean %>>%
  (~ c(nrow(.), length(unique(.$patient_id))) -> a1) %>>%
  filter(age >= 18 & age <= 50) %>>%
  (~ c(nrow(.), length(unique(.$patient_id))) -> a2) %>>%
  group_by(patient_id) %>% filter(year != lag(year) | row_number() == 1) %>>%
  (~ c(nrow(.), length(unique(.$patient_id))) -> a3) %>>%
  group_by(patient_id) %>%
  mutate(Transplanted = cumsum(s10transplantsincelastannualrevi == "Y"),
         EverTx = as.integer(max(Transplanted) > 0)) %>%
  filter(Transplanted == 0) %>%
  mutate(Age_LastEncounter  = max(age),
         Date_LastEncounter = max(s01encounterdate)) %>>%
  (~ c(nrow(.), length(unique(.$patient_id))) -> a4)

cf2_bypatient <- cf2 %>%
  select(patient_id, dmg_dateofbirth, dmg_sex, dmg_ageatdiagnosis, f508_class) %>%
  distinct(across(everything())) %>%
  drop_na() %>%
  mutate("diagnosedafter1yo" = factor(ifelse(dmg_ageatdiagnosis > 1, 1, 0)),
         "f508_homozygous"   = factor(ifelse(f508_class == "Homoz", 1, 0)),
         "yob"               = year(dmg_dateofbirth))

cf2 <- left_join(cf2,
                 select(cf2_bypatient, patient_id, diagnosedafter1yo, f508_homozygous, yob),
                 by = "patient_id")

explanatory_after <- c("patient_id","age","dmg_sex","diagnosedafter1yo","f508_homozygous","yob")

cfLONG <- cf2 %>%
  filter(!!sym(var_long[1]) > 0) %>%
  filter(!!sym(var_long[2]) > 0 & !!sym(var_long[2]) < 80) %>%
  drop_na(all_of(explanatory_after)) %>>%
  (~ c(nrow(.), length(unique(.$patient_id))) -> a5)

cfLONG$patient_id <- droplevels(cfLONG$patient_id)
cfLONG <- ungroup(cfLONG)

Please note, for some people the date of death is recorded, but not the age at death.

Event time is equal to:

  • death date — if the person died and never received transplant;
  • last encounter before (first) transplant — if the person received a transplant;
  • last date recorded on the dataset — if the person was not dead and never received a transplant.

Death status is equal to 1 if the person died and never received transplant, 0 otherwise (censored).

last_deathdate_recorded <- max(cf2$dmg_dateofdeath, na.rm = TRUE)

cfSURV <- cf2 %>%
  group_by(patient_id) %>%
  filter(row_number() == 1) %>%
  select(patient_id, dmg_sex, dmg_dateofbirth, dmg_dateofdeath, dmg_ageatdeath,
         diagnosedafter1yo, f508_homozygous, yob, EverTx, Date_LastEncounter) %>%
  mutate(event_time = case_when(
           !is.na(dmg_dateofdeath) && EverTx == 0 ~ dmg_dateofdeath,
           EverTx == 1                             ~ Date_LastEncounter,
           .default = last_deathdate_recorded),
         death_status = ifelse(!is.na(dmg_dateofdeath) && EverTx == 0, 1, 0),
         ageatevent   = max(18, as.numeric(interval(dmg_dateofbirth, event_time), "years"))) %>%
  filter(patient_id %in% unique(cfLONG$patient_id)) %>%
  ungroup()

CF Female dataset — Administrative censoring at 50

  • Introduce censoring at 50 years old if they had not experienced the event by then.
  • Introduce censoring at age at last annual review before transplant.
female_subsample <- cfLONG %>%
  filter(dmg_sex == "F") %>%
  pull(patient_id) %>%
  unique()

cfLONG_female <- cfLONG %>%
  filter(patient_id %in% female_subsample) %>%
  mutate(patient_id  = droplevels(patient_id),
         id          = as.integer(patient_id),
         age_minus18 = age - 18,
         bmi10       = bmi / 10) %>%
  ungroup()

cfLONG_female %>%
  group_by(patient_id) %>%
  summarise(no_encounters = n()) %>%
  pull(no_encounters) %>%
  quantile(., c(0.25, 0.5, 0.75))
25% 50% 75% 
  4   6  10
cat("The number of individuals in the dataset is", nrow(cfSURV_female),
    ", for a total number of annual visits of", nrow(cfLONG_female))
The number of individuals in the dataset is 3012, for a total number of annual visits of 21910
cfSURV_female <- cfSURV %>%
  filter(patient_id %in% female_subsample) %>%
  mutate(patient_id              = droplevels(patient_id),
         id                      = as.integer(patient_id),
         yob80                   = yob - 1980,
         AC50_ageatevent_minus18 = ifelse(ageatevent > 50, 50 - 18, ageatevent - 18),
         AC50_ageatevent_minus18 = ifelse(AC50_ageatevent_minus18 > 0, AC50_ageatevent_minus18, 0),
         AC50_death_status       = ifelse(ageatevent > 50 & death_status == 1, 0, death_status)) %>%
  ungroup()

cfSURV_female %>%
  survfit2(Surv(AC50_ageatevent_minus18, AC50_death_status) ~ diagnosedafter1yo, data = .) %>%
  ggsurvfit::ggsurvfit() + add_confidence_interval() + add_risktable() +
  ylim(c(0,1)) + scale_x_continuous(breaks = seq(0,32,4)) +
  labs(x = "years", y = "Overall survival probability")

cfSURV_female %>%
  survfit2(Surv(AC50_ageatevent_minus18, AC50_death_status) ~ f508_homozygous, data = .) %>%
  ggsurvfit::ggsurvfit() + add_confidence_interval() + add_risktable() +
  ylim(c(0,1)) + scale_x_continuous(breaks = seq(0,32,4)) +
  labs(x = "years", y = "Overall survival probability")

cfSURV_female %>%
  survfit2(Surv(AC50_ageatevent_minus18, AC50_death_status) ~ f508_homozygous, data = .) %>%
  ggsurvfit::ggsurvfit() + add_confidence_interval() + add_risktable() +
  ylim(c(0,1)) + scale_x_continuous(breaks = seq(0,32,4)) +
  labs(x = "years", y = "Overall survival probability")
Annual encounters bar chart
FEV1 smooth by covariate groups
BMI smooth by covariate groups
KM by f508_homozygous
Figure S1: KM curves by F508 3 groups

JM-WIV for females (CV association) — Fit model

f_fev <- list(formula(gli_fev ~ age_minus18 + diagnosedafter1yo + f508_homozygous + (1 | id)),
              formula(sigma   ~ age_minus18 + diagnosedafter1yo + f508_homozygous + (1 | id)))

f_bmi <- list(formula(bmi   ~ age_minus18 + diagnosedafter1yo + f508_homozygous + (1 | id)),
              formula(sigma  ~ age_minus18 + diagnosedafter1yo + f508_homozygous + (1 | id)))

f_Event <- survival::Surv(AC50_ageatevent_minus18, AC50_death_status) ~
             diagnosedafter1yo + f508_homozygous

prior_input <- list(
  # y1mu_prior_mean = NULL,   y2mu_prior_mean = NULL,
  # y1sigma_prior_mean = NULL, y2sigma_prior_mean = NULL,
  # e_prior_mean = NULL,       a_prior_mean = rep(0, 4),
  # ymu_prior_mean_for_intercept = NULL,
  # ysigma_prior_mean_for_intercept = NULL,
  # e_prior_mean_for_aux = NULL,
  # y1mu_prior_scale = 5,      y2mu_prior_scale = 5,
  # y1sigma_prior_scale = 5,   y2sigma_prior_scale = 5,
  # e_prior_scale = 5,         a_prior_scale = rep(2.5, 4),
  # ymu_prior_scale_for_intercept = rep(0.5, 2),
  # ysigma_prior_scale_for_intercept = rep(0.5, 2),
  e_prior_scale_for_aux  = rep(5, 6),
  b_prior_scale          = rep(1, 4),
  b_prior_df             = rep(1, 4),
  b_prior_regularization = 5
)

standata_jmwiv_CFfemale <- make_standata_jmwiv(
  formulaLong1 = f_fev,
  formulaLong2 = f_bmi,
  dataLong     = cfLONG_female,
  id_var       = "id",
  time_varLong = "age_minus18",
  formulaEvent = f_Event,
  dataEvent    = cfSURV_female,
  a_K          = 4,
  assoc        = "CV",
  basehaz      = "bs",
  basehaz_aux  = list(df = 6, knots = NULL, degree = 3),
  qnodes       = 15L,
  prior_list   = prior_input)
stanc_mod <- stanc(here::here("STAN_functions/jm-wiv.stan"))
mod <- stan_model(stanc_ret = stanc_mod, model_name = "jmwiv_simple", verbose = FALSE)

CFfemale_jmwiv_CVassoc <- sampling(object = mod, data = standata_jmwiv_CFfemale,
    cores = 2, chains = 2, warmup = 1000, iter = 2000, refresh = 0.2, seed = 856,
    control = list(max_treedepth = 15))

Alternatively, the last two chunks can be run using rstanjmwiv::jmwiv_stan() — but this will not save the standata object.

CFfemale_jmwiv_CVassoc <- jmwiv_stan(
    formulaLong1 = f_fev, formulaLong2 = f_bmi,
    dataLong     = cfLONG_female, id_var = "id", time_varLong = "age_minus18",
    formulaEvent = f_Event, dataEvent = cfSURV_female,
    a_K = 4, assoc = "CV", basehaz = "bs",
    basehaz_aux = list(df = 6, knots = NULL, degree = 3),
    qnodes = 15L, prior_list = prior_input,
    cores = 2, chains = 2, warmup = 1000, iter = 2000, seed = 856,
    init_r = 1, control = list(max_treedepth = 15))

JM-WIV for females (CV association) — Results

print(get_elapsed_time(CFfemale_jmwiv_CVassoc))
         warmup  sample
chain:1 35433.2 17009.6
chain:2 35170.3 15348.1
print(CFfemale_jmwiv_CVassoc, pars = c("y1mu_gamma", "y1sigma_gamma", "y2mu_gamma",
    "y2sigma_gamma", "y1mu_beta", "y1sigma_beta", "y2mu_beta", "y2sigma_beta"),
    probs = c(0.025, 0.975))
Inference for Stan model: jmwiv.
2 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=2000.

                 mean se_mean   sd  2.5% 97.5% n_eff Rhat
y1mu_gamma       1.83    0.00 0.01  1.81  1.86    43 1.03
y1sigma_gamma   -1.55    0.00 0.01 -1.57 -1.53   744 1.00
y2mu_gamma      21.54    0.01 0.06 21.42 21.66   138 1.00
y2sigma_gamma    0.16    0.00 0.01  0.14  0.18   433 1.01
y1mu_beta[1]    -0.04    0.00 0.00 -0.04 -0.04   800 1.00
y1mu_beta[2]     0.30    0.01 0.03  0.23  0.37    25 1.08
y1mu_beta[3]    -0.23    0.00 0.03 -0.29 -0.17    57 1.02
y1sigma_beta[1] -0.02    0.00 0.00 -0.02 -0.02  1136 1.00
y1sigma_beta[2]  0.02    0.00 0.02 -0.03  0.06  1008 1.00
y1sigma_beta[3]  0.05    0.00 0.02  0.00  0.09   827 1.00
y2mu_beta[1]     0.05    0.00 0.00  0.04  0.05  1349 1.00
y2mu_beta[2]     0.87    0.02 0.13  0.61  1.14    64 1.03
y2mu_beta[3]    -0.89    0.01 0.12 -1.15 -0.67   105 1.01
y2sigma_beta[1]  0.00    0.00 0.00  0.00  0.00  1930 1.00
y2sigma_beta[2]  0.02    0.00 0.02 -0.03  0.06   758 1.00
y2sigma_beta[3] -0.08    0.00 0.02 -0.12 -0.03   583 1.01

Samples were drawn using NUTS(diag_e) at Sun Jan 18 10:34:54 2026.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).

The reconstructed intercepts are:

print(CFfemale_jmwiv_CVassoc, pars = c("y1mu_Intercept", "y1sigma_Intercept",
    "y2mu_Intercept", "y2sigma_Intercept"), probs = c(0.025, 0.975))
Inference for Stan model: jmwiv.
1 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=1000.

                   mean se_mean   sd  2.5% 97.5% n_eff Rhat
y1mu_Intercept     2.19    0.01 0.03  2.13  2.23    21 1.01
y1sigma_Intercept -1.39    0.00 0.02 -1.43 -1.35   564 1.00
y2mu_Intercept    21.14    0.01 0.10 20.96 21.33    91 1.00
y2sigma_Intercept  0.19    0.00 0.02  0.15  0.24   433 1.00

Samples were drawn using NUTS(diag_e) at Sun Jan 18 02:58:17 2026.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).

Here are the association terms:

print(CFfemale_jmwiv_CVassoc, pars = "a_beta", probs = c(0.025, 0.975))
Inference for Stan model: jmwiv.
2 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=2000.

           mean se_mean   sd  2.5% 97.5% n_eff Rhat
a_beta[1] -2.14    0.00 0.12 -2.39 -1.91  1408    1
a_beta[2]  2.69    0.02 0.54  1.61  3.77   884    1
a_beta[3] -0.15    0.00 0.03 -0.21 -0.09   644    1
a_beta[4]  0.38    0.01 0.13  0.10  0.63   704    1

Samples were drawn using NUTS(diag_e) at Sun Jan 18 10:34:54 2026.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
print(CFfemale_jmwiv_CVassoc, pars = c("e_Intercept", "e_beta"), probs = c(0.025, 0.975))
Inference for Stan model: jmwiv.
2 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=2000.

             mean se_mean   sd  2.5% 97.5% n_eff Rhat
e_Intercept  0.07       0 0.06 -0.05  0.19  3611    1
e_beta[1]   -0.11       0 0.09 -0.29  0.06  3352    1
e_beta[2]   -0.04       0 0.09 -0.22  0.12  3549    1

Samples were drawn using NUTS(diag_e) at Sun Jan 18 10:34:54 2026.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
print(CFfemale_jmwiv_CVassoc, pars = "b_sd", probs = c(0.025, 0.975))
Inference for Stan model: jmwiv.
2 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=2000.

        mean se_mean   sd 2.5% 97.5% n_eff Rhat
b_sd[1] 0.78       0 0.01 0.76  0.80   118 1.01
b_sd[2] 0.45       0 0.01 0.43  0.47   838 1.00
b_sd[3] 3.24       0 0.05 3.15  3.34   254 1.00
b_sd[4] 0.46       0 0.01 0.44  0.48   751 1.01

Samples were drawn using NUTS(diag_e) at Sun Jan 18 10:34:54 2026.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
summary(CFfemale_jmwiv_CVassoc)$summary[c("a_beta[1]", "a_beta[2]", "a_beta[3]",
    "a_beta[4]"), c(1, 4, 8)] %>%
    multiply_by(c(1/10, 1/10, 1, 1)) %>%
    exp()
               mean      2.5%     97.5%
a_beta[1] 0.8073329 0.7876352 0.8264469
a_beta[2] 1.3084585 1.1741484 1.4572759
a_beta[3] 0.8639766 0.8136306 0.9167155
a_beta[4] 1.4649302 1.1090392 1.8688453
ranef_corr_jmwiv(CFfemale_jmwiv_CVassoc)
             [,1]         [,2]         [,3]         [,4]
[1,]  1.000000000 0.1439591359 0.4678737197 -0.002110608
[2,]  0.143959136 1.0000000000 0.0004370978  0.254622966
[3,]  0.467873720 0.0004370978 1.0000000000  0.485507917
[4,] -0.002110608 0.2546229657 0.4855079166  1.000000000

The posterior predictive checks for the longitudinal submodel are useful to assess the general fit.

list_of_draws <- rstan::extract(CFfemale_jmwiv_CVassoc)
ppdata <- replicate(10, ppcheck(list_of_draws = list_of_draws, data = cfLONG_female,
    x = "age_minus18", idvar = "id"), simplify = FALSE)
library(bayesplot)
y1_rep <- sapply(ppdata, "[[", 1) %>% t()
y2_rep <- sapply(ppdata, "[[", 2) %>% t()
ppc_dens_overlay(cfLONG_female$gli_fev, y1_rep)
ppc_dens_overlay(cfLONG_female$bmi, y2_rep)
PPC FEV1
PPC BMI

The function below returns the predictions for two randomly selected women.

pred_plot_jmwiv(stanfit = CFfemale_jmwiv_CVassoc, standata = standata_jmwiv_CFfemale,
    var_labels = c(var_long_labels, "Survival probability"), id_selected = id_sel,
    plot_jm = FALSE) + labs(x = "Age") +
    scale_x_continuous(breaks = seq(0, 32, by = 5), labels = 18 + seq(0, 32, by = 5)) +
    theme(strip.text.x = element_blank())
Individual predictions

Session info

sessionInfo()
R version 4.3.3 (2024-02-29 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows Server 2022 x64 (build 20348)

locale:
[1] LC_COLLATE=English_United Kingdom.utf8
[2] LC_CTYPE=English_United Kingdom.utf8
[3] LC_MONETARY=English_United Kingdom.utf8
[4] LC_NUMERIC=C
[5] LC_TIME=English_United Kingdom.utf8

time zone: Europe/London

attached base packages:
[1] stats  graphics  grDevices  utils  datasets  methods  base

other attached packages:
 [1] bayesplot_1.11.1      DiagrammeR_1.0.11     lubridate_1.9.3
 [4] survival_3.5-8        stringr_1.5.1         gridExtra_2.3
 [7] magrittr_2.0.3        pipeR_0.6.1.3         tidyr_1.3.1
[10] dplyr_1.1.4           ggsurvfit_1.0.0       ggridges_0.5.6
[13] ggpubr_0.6.0          ggplot2_3.5.0         brms_2.20.4
[16] Rcpp_1.0.12           rstan_2.32.5          StanHeaders_2.32.5
[19] pacman_0.5.1          rstanjmwiv_1.0.0.0000