This is the code for the paper “Factors associated with within-individual variability of lung function for people with cystic fibrosis: a longitudinal registry study” by Palma et al., available as a preprint on medrxiv: https://www.medrxiv.org/content/10.1101/2023.05.12.23289768v1.
library(pacman)
p_load(dplyr, tidyr, ggplot2, lubridate, knitr, kableExtra, stringr, ggpubr, ggridges) #, devtools
p_load(pipeR, DiagrammeR, DiagrammeRsvg, magrittr, rsvg)
p_load(brms, brmsmargins, posterior, bayesplot, tidybayes, survival, emmeans)
theme_set(theme_light(base_size = 14) + theme(panel.grid.minor = element_blank(),
plot.margin = margin(t = 6, r = 6, b = 6, l = 6, unit = "pt"), legend.position = "bottom",
legend.margin = margin(0, 0, 0, 0, unit = "pt"), legend.box.margin = margin(0,
0, 0, 0, unit = "pt")))
"%ni%" <- Negate("%in%")
var_long <- "gli_fev"
Import the dataset and store it in the cfdata
object.
cfdata_clean <- cfdata %>%
mutate(s01encounterdate = lubridate::dmy(s01encounterdate), dmg_dateofdeath = lubridate::dmy(dmg_dateofdeath),
dmg_dateofbirth = lubridate::dmy(dmg_dateofbirth)) %>%
mutate_at(c("patient_id", "sitecode_anon", "dmg_sex", "dmg_ethnicity_gli", "s01isthisanannualreviewencounter",
"s05culturespeciesstaph", "s05culturespeciespseudoaeruginos", "s07pancreaticenzymesupplements"),
list(~factor(.)))
rm(cfdata)
The number of rows is 166117 and the number of variables is 1357. The number of unique patient ID is 13899.
We have kept only the first visit per year per each patient. To
account for all transplants, you need to first compute transplanted
indicators, then remove missing values (often, in the year of transplant
gli_fev
is missing).
explanatory_before <- c("patient_id", "age", "dmg_sex", "dmg_ageatdiagnosis", "dmg_dateofbirth", "f508_class")
cf2 <- cfdata_clean %>>%
(~ c(nrow(.), length(unique(.$patient_id))) -> a1) %>>%
filter(age >= 18 & age <= 50) %>>%
(~ c(nrow(.), length(unique(.$patient_id))) -> a2) %>>%
group_by(patient_id) %>% filter(year != lag(year) | row_number() == 1) %>>%
(~ c(nrow(.), length(unique(.$patient_id))) -> a3) %>>%
group_by(patient_id) %>%
mutate(Transplanted = cumsum(s10transplantsincelastannualrevi == "Y")) %>%
filter(Transplanted == 0) %>>%
(~ c(nrow(.), length(unique(.$patient_id))) -> a4)
cf2_bypatient <- cf2 %>%
select(patient_id, dmg_dateofbirth, dmg_sex, dmg_ageatdiagnosis, f508_class) %>%
distinct(across(everything())) %>%
drop_na() %>% ##add_count(patient_id, name = "no_records")
mutate("diagnosedafter1yo" = factor(ifelse(dmg_ageatdiagnosis>1, 1, 0)),
"f508_homozygous" = factor(ifelse(f508_class == "Homoz", 1, 0)),
"yob" = year(dmg_dateofbirth))
cf2 <- left_join(cf2,
select(cf2_bypatient, patient_id, diagnosedafter1yo, f508_homozygous, yob),
by = "patient_id")
explanatory_after <- c("patient_id", "age", "dmg_sex", "diagnosedafter1yo", "dmg_dateofbirth", "f508_homozygous")
dependent <- var_long
cf2%>%
finalfit::missing_pattern(dependent, explanatory_before, rotate.names = TRUE)
## patient_id age dmg_dateofbirth dmg_sex f508_class gli_fev
## 50530 1 1 1 1 1 1
## 17878 1 1 1 1 1 1
## 3223 1 1 1 1 1 0
## 1573 1 1 1 1 1 0
## 0 0 0 0 0 4796
## dmg_ageatdiagnosis
## 50530 1 0
## 17878 0 1
## 3223 1 1
## 1573 0 2
## 19451 24247
cf2 <- cf2 %>%
filter(!!sym(var_long) > 0) %>%
drop_na(all_of(explanatory_after)) %>>%
(~ c(nrow(.), length(unique(.$patient_id))) -> a5)
inclusion_diagram <- DiagrammeR::grViz("digraph {
graph [layout = dot, rankdir = TB]
node [shape = rectangle, fixedsize=true, width=4]
rec1 [label = 'Original dataset\\n(N = @@6, n = @@1)']
rec2 [label = 'Adults (18-49 years old)\\n(N = @@7, n = @@2)']
rec3 [label = 'First review every year\\n(N = @@8, n = @@3)']
rec4 [label = 'Annual reviews before lung transplant\\n(N = @@9, n = @@4)']
rec5 [label = 'Non missing values of FEV1 and covariates\\n(N = @@10, n = @@5)']
# edge definitions with the node IDs
rec1 -> rec2 -> rec3 -> rec4 -> rec5
}
[1]: a1
[2]: a2
[3]: a3
[4]: a4
[5]: a5
[6]: a1[2]
[7]: a2[2]
[8]: a3[2]
[9]: a4[2]
[10]: a5[2]
")
inclusion_diagram
inclusion_diagram %>>% print(.) %>>% export_svg %>%
charToRaw %>%
rsvg_pdf("CFAdult_inclusiondiagram.pdf", width = 350)
p1 <- cf2[1:3971,] %>% #3971
drop_na(age, all_of(var_long)) %>%
ggplot(data = .,
aes_string(x = "age", y = var_long, group = "patient_id" #,color = "grey1"
)
) +
geom_point(alpha = 0.05) +
geom_line(alpha = 0.05) +
geom_smooth(data = cf2,
aes(x = age, y = get(var_long), group = dmg_sex, color = dmg_sex, fill = dmg_sex, linetype = dmg_sex),
size = 1.25) +
theme(legend.position = "bottom" ,
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
plot.margin = margin(t = 6, r = 6, b = 0, l = 6,
unit = "pt")
) +
scale_x_continuous(limits = c(18,50),
breaks = scales::pretty_breaks(n = 10),
expand = expansion(add = 0)) +
scale_y_continuous(limits = c(0, NA), expand = expansion(add = 0.01)) +
scale_color_manual(breaks = c("F", "M"), values = c("#F8766D", "#001D81")) +
scale_fill_manual(breaks = c("F", "M"), values = c("#F8766D", "#001D81")) +
scale_linetype_manual(breaks = c("F", "M"), values = c("81", "11"),
na.value = "solid") +
labs(x = "Age",
y = bquote(FEV[1]),
color = "Sex",
fill = "Sex",
linetype = "Sex"
)
p2 <- ggplot(cf2) +
geom_density(aes(age, after_stat(count), fill = dmg_sex),
position = "fill", alpha = 0.5, adjust = 1/2) +
stat_ecdf(aes(age, color = dmg_sex, linetype = dmg_sex),
geom = "step", pad = FALSE, lwd = 1.25) +
geom_hline(aes(yintercept = 0.5), lty = 2) +
labs(x = "Age",
y = "Sex distribution",
fill = "Sex", linetype = "Sex") +
scale_x_continuous(limits = c(18,50),
breaks = scales::pretty_breaks(n = 10),
expand = expansion(add = 0)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 5),
labels = scales::percent,
expand = expansion(add = 0)) +
scale_color_manual(breaks = c("F", "M"), values = c("#F8766D", "#001D81")) +
scale_fill_manual(breaks = c("F", "M"), values = c("#F8766D", "#001D81")) +
scale_linetype_manual(breaks = c("F", "M"), values = c("81", "11"),
na.value = "solid") +
theme(legend.position="none",
plot.margin = margin(t = 0, r = 6, b = 6, l = 6,
unit = "pt")
)
ggpubr::ggarrange(p1, NULL, p2,
heights = c(1.2, 0.05, 1),
ncol = 1, nrow = 3, align = "v",
common.legend = TRUE, legend = "bottom")
rm(p1,p2)
cf2 %>%
group_by(patient_id) %>%
summarise(no_encounters = n()) %>%
ggplot(., aes(x = no_encounters)) + geom_bar(width = 0.5) + scale_x_continuous(limits = c(0,
NA), breaks = scales::pretty_breaks(n = 8), expand = expansion(add = 0)) + scale_y_continuous(limits = c(0,
NA), breaks = seq(0, 2000, by = 100), expand = expansion(add = 10)) + labs(x = "Annual reviews",
y = "Number of individuals")
tab1_prep <- cf2 %>%
group_by(patient_id) %>%
summarise(dmg_sex, no_encounters = n(), ageatdiagnosis = first(dmg_ageatdiagnosis),
diagnosedafter1yo = first(diagnosedafter1yo), f508_homozygous = first(f508_homozygous),
ageatfirstvisit = first(age), ageatlastvisit = last(age), death_status = !is.na(dmg_dateofdeath),
yearofbirth = first(yob), first_fev = first(!!sym(var_long)), last_fev = last(!!sym(var_long))) %>%
distinct(across(everything()))
quantile_interval <- function(x) {
qts <- round(quantile(x, probs = c(0.25, 0.5, 0.75), na.rm = T), 2)
paste0(qts[2], " (", qts[1], "; ", qts[3], ")")
}
tab1_tot <- tab1_prep %>%
mutate(dmg_sex = "Total") %>%
group_by(dmg_sex) %>%
summarise(`Number of individuals` = n_distinct(patient_id), `Median number of encounters per individual (IQR)` = quantile_interval(no_encounters),
`Difference in years between first and last encounter date (IQR)` = quantile_interval(ageatlastvisit -
ageatfirstvisit), `Median age at diagnosis in years (IQR)` = quantile_interval(ageatdiagnosis),
`No of individuals diagnosed within the first year after birth (%)` = paste0(sum(diagnosedafter1yo ==
0, na.rm = T), " (", round(100 * mean(diagnosedafter1yo == 0, na.rm = T),
2), ")"), `No of individuals homozygous for F508 (%)` = paste0(sum(f508_homozygous ==
1, na.rm = T), " (", round(100 * mean(f508_homozygous == 1, na.rm = T),
2), ")"), `No of individuals with one FEV1 measure (%)` = paste0(sum(no_encounters ==
1), " (", round(100 * mean(no_encounters == 1), 2), ")"), `FEV1 at first encounter (IQR)` = quantile_interval(first_fev),
`FEV1 at last encounter (IQR)` = quantile_interval(last_fev), `Number of deaths (%)` = paste0(sum(death_status ==
1), " (", round(100 * mean(death_status == 1), 2), ")"))
tab1 <- tab1_prep %>%
group_by(dmg_sex) %>%
summarise(`Number of individuals` = n_distinct(patient_id), `Median number of encounters per individual (IQR)` = quantile_interval(no_encounters),
`Difference in years between first and last encounter date (IQR)` = quantile_interval(ageatlastvisit -
ageatfirstvisit), `Median age at diagnosis in years (IQR)` = quantile_interval(ageatdiagnosis),
`No of individuals diagnosed within the first year after birth (%)` = paste0(sum(diagnosedafter1yo ==
0, na.rm = T), " (", round(100 * mean(diagnosedafter1yo == 0, na.rm = T),
2), ")"), `No of individuals homozygous for F508 (%)` = paste0(sum(f508_homozygous ==
1, na.rm = T), " (", round(100 * mean(f508_homozygous == 1, na.rm = T),
2), ")"), `No of individuals with one FEV1 measure (%)` = paste0(sum(no_encounters ==
1), " (", round(100 * mean(no_encounters == 1), 2), ")"), `FEV1 at first encounter (IQR)` = quantile_interval(first_fev),
`FEV1 at last encounter (IQR)` = quantile_interval(last_fev), `Number of deaths (%)` = paste0(sum(death_status ==
1), " (", round(100 * mean(death_status == 1), 2), ")"))
rbind(tab1, tab1_tot) %>%
t() %>%
kbl() %>%
kable_paper("hover", html_font = "helvetica", full_width = F)
dmg_sex | F | M | Total |
Number of individuals | 3287 | 3814 | 7099 |
Median number of encounters per individual (IQR) | 8 (5; 12) | 9 (5; 13) | 9 (5; 13) |
Difference in years between first and last encounter date (IQR) | 8 (4.15; 12.62) | 8.87 (4.83; 14.06) | 8.47 (4.53; 13.46) |
Median age at diagnosis in years (IQR) | 0.67 (0.12; 4.04) | 0.53 (0.12; 3.97) | 0.58 (0.12; 3.99) |
No of individuals diagnosed within the first year after birth (%) | 1807 (54.97) | 2190 (57.42) | 3997 (56.29) |
No of individuals homozygous for F508 (%) | 1589 (48.34) | 1924 (50.45) | 3513 (49.47) |
No of individuals with one FEV1 measure (%) | 176 (5.35) | 191 (5.01) | 367 (5.17) |
FEV1 at first encounter (IQR) | 2.02 (1.41; 2.63) | 2.9 (1.98; 3.7) | 2.4 (1.64; 3.21) |
FEV1 at last encounter (IQR) | 1.49 (0.9; 2.28) | 2.15 (1.23; 3.23) | 1.8 (1.05; 2.78) |
Number of deaths (%) | 1028 (31.27) | 987 (25.88) | 2015 (28.38) |
Basic model (from Hedeker et al., 2008) : the submodel for the outcome mean includes as covariates age and sex with a random slope per subject; the submodel for the exponential of the standard deviation includes a random effect per subject. We also estimate the correlation between group-level effects.
\(y_{ij} = x_{ij}^\mathsf{T}\beta + \upsilon_i + \varepsilon_{ij} \qquad \qquad i = 1,\dots N \text{ subjects, } j = 1,\dots, n_i \text{ visits}\)
\(\varepsilon_{ij}\) independent of \(\upsilon_{i}, \; \forall i\)
between-subject variance: \(\upsilon_i \sim N(0, \sigma_{\upsilon_{i}}^2), \qquad \sigma_{\upsilon_{i}}^2 =\exp{(u_i^\mathsf{T}\alpha)}\)
within-subjects variance: \(\varepsilon_{ij} \sim N(0, \sigma_{\varepsilon_{ij}}^2), \qquad \sigma_{\varepsilon_{ij}}^2 = \exp{(w_{ij}^\mathsf{T}\tau + \omega_i)}\)
random effects are correlated via \(\rho\): \(\begin{pmatrix}\upsilon_i\\\omega_{i}\end{pmatrix} \sim N \begin{bmatrix} \begin{pmatrix}0\\ 0\end{pmatrix}\!\!, \begin{pmatrix} \sigma_{\upsilon}^2 & \rho\sigma_{\upsilon}\sigma_{\omega} \\ & \sigma_{\omega}^2 \end{pmatrix} \end{bmatrix}\)
For the complete model used in this analysis, see the Appendix of the paper.
By default, natural cubic regression spline (bs = ‘cr’) are defined by a modest sized set of knots spread evenly (by quantiles) through the covariate values. They are penalized by the conventional intergrated square second derivative cubic spline penalty (https://stat.ethz.ch/R-manual/R-devel/library/mgcv/html/smooth.terms.html). B-splines family (bs = ‘bs’, bs = ‘ps’, bs = ‘ad’) place knots evenly.
A list of the following objects:
bform
: it specifies the formula to be used in
brm
. Being a distributional model, we specify a model for
the outcome gli_fev
and one for the parameter
sigma
. The family of distributions is Gaussian.
prior_fitupdate
: it contains the prior
specifications for the model.
brms_fitprior
: it fits the model specified in
bform
with the prior specification in
prior_fitupdate
on the dataset data_fit
with
2400 iterations (of which 1200 warm-up). First, only the prior is
sampled (in case some prior checks are needed), than the model is
updated in brms_fit1
. Watanabe AIC and loo can be then
added (useful for model comparison).
The factor-smooth interaction in brms
, derived from the
mgcv
package, specifies a different smoothing function for
each sex with a centering constraint with respect to the model
intercept. Therefore, to account for sex mean differences, a main
parametric term for sex is also included ( https://stat.ethz.ch/Rmanual/R-devel/library/mgcv/html/gam.models.html).
###Note: this block takes a long time to run! Do not run unless you need to fit the model. You might also want to store the brms_fit1 model in a .rds file or a workspace.
seed1 <- 787878
bform <- bf(
gli_fev ~ dmg_sex + s(age, by = dmg_sex, bs = 'cr', k = 5)
+ diagnosedafter1yo
+ s(yob, bs = 'cr', k = 5)
+ f508_homozygous
+ (1 |ID| patient_id),
sigma ~ dmg_sex + s(age, by = dmg_sex, bs = 'cr', k = 5)
+ diagnosedafter1yo
+ s(yob, bs = 'cr', k = 5)
+ f508_homozygous
+ (1 |ID| patient_id),
family = gaussian()
)
prior_fitupdate <- c(prior(normal(2, 0.01), class = Intercept),
prior(normal(0, 0.01), class = b),
prior(exponential(10), class = sd),
prior(normal(0, 0.1), class = Intercept, dpar = sigma),
prior(normal(0, 0.1), class = b, dpar = sigma),
prior(exponential(10), class = sd, dpar = sigma),
prior(exponential(10), class = sds),
prior(exponential(10), class = sds, dpar = sigma),
prior(lkj(2), class = cor))
brms_fitprior <- brm(bform,
data = data_fit,
seed = seed1,
prior = prior_fitupdate,
warmup = 1200,
iter = 2400,
thin = 5,
chains = 2, cores = 2
, sample_prior = "only"
, control = list(adapt_delta = 0.95)
)
fit1_time <- system.time(
brms_fit1 <- update(brms_fitprior, sample_prior="no", cores = 2, seed = seed1)
)
brms_fit1 <- add_criterion(brms_fit1, c("waic", "loo"), cores = 2)
The package brms
uses only complete cases, that is, the
subjects with no NAs in the covariates selected for the model.
summary(brms_fit1)
## Family: gaussian
## Links: mu = identity; sigma = log
## Formula: gli_fev ~ dmg_sex + s(age, by = dmg_sex, bs = "cr", k = 5) + diagnosedafter1yo + s(yob, bs = "cr", k = 5) + f508_homozygous + (1 | ID | patient_id)
## sigma ~ dmg_sex + s(age, by = dmg_sex, bs = "cr", k = 5) + diagnosedafter1yo + s(yob, bs = "cr", k = 5) + f508_homozygous + (1 | ID | patient_id)
## Data: data_fit (Number of observations: 65522)
## Draws: 2 chains, each with iter = 2400; warmup = 1200; thin = 5;
## total post-warmup draws = 480
##
## Smooth Terms:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sds(sagedmg_sexF_1) 0.02 0.01 0.01 0.06 1.00 480
## sds(sagedmg_sexM_1) 0.03 0.02 0.01 0.08 1.01 458
## sds(syob_1) 0.12 0.05 0.05 0.25 1.01 174
## sds(sigma_sagedmg_sexF_1) 0.11 0.04 0.05 0.21 1.00 430
## sds(sigma_sagedmg_sexM_1) 0.10 0.05 0.05 0.22 1.00 463
## sds(sigma_syob_1) 0.12 0.06 0.05 0.27 1.00 406
## Tail_ESS
## sds(sagedmg_sexF_1) 494
## sds(sagedmg_sexM_1) 389
## sds(syob_1) 445
## sds(sigma_sagedmg_sexF_1) 494
## sds(sigma_sagedmg_sexM_1) 477
## sds(sigma_syob_1) 363
##
## Group-Level Effects:
## ~patient_id (Number of levels: 7099)
## Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(Intercept) 1.04 0.01 1.02 1.06 1.05
## sd(sigma_Intercept) 0.45 0.01 0.44 0.46 1.00
## cor(Intercept,sigma_Intercept) 0.16 0.02 0.13 0.20 1.00
## Bulk_ESS Tail_ESS
## sd(Intercept) 73 138
## sd(sigma_Intercept) 404 494
## cor(Intercept,sigma_Intercept) 399 492
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept 2.01 0.01 1.99 2.03 1.01 83
## sigma_Intercept -1.48 0.01 -1.51 -1.46 1.00 336
## dmg_sexM 0.10 0.01 0.08 0.12 1.02 153
## diagnosedafter1yo1 0.05 0.01 0.03 0.07 1.00 229
## f508_homozygous1 -0.05 0.01 -0.06 -0.03 1.01 148
## sigma_dmg_sexM 0.25 0.01 0.22 0.27 1.00 465
## sigma_diagnosedafter1yo1 -0.04 0.01 -0.07 -0.02 1.00 239
## sigma_f508_homozygous1 0.06 0.01 0.03 0.08 1.00 443
## sage:dmg_sexF_1 -0.27 0.00 -0.27 -0.26 1.00 375
## sage:dmg_sexM_1 -0.29 0.00 -0.30 -0.28 1.00 459
## syob_1 0.00 0.01 -0.02 0.02 1.00 355
## sigma_sage:dmg_sexF_1 -0.09 0.01 -0.11 -0.06 1.00 435
## sigma_sage:dmg_sexM_1 -0.08 0.01 -0.10 -0.06 1.00 461
## sigma_syob_1 -0.12 0.03 -0.19 -0.06 1.00 449
## Tail_ESS
## Intercept 188
## sigma_Intercept 492
## dmg_sexM 315
## diagnosedafter1yo1 299
## f508_homozygous1 356
## sigma_dmg_sexM 358
## sigma_diagnosedafter1yo1 419
## sigma_f508_homozygous1 469
## sage:dmg_sexF_1 465
## sage:dmg_sexM_1 453
## syob_1 358
## sigma_sage:dmg_sexF_1 436
## sigma_sage:dmg_sexM_1 422
## sigma_syob_1 378
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
To derive fitted values, a reference guide is in https://www.andrewheiss.com/blog/2022/09/26/guide-visualizing-types-posteriors/#tldr-diagrams-and-cheat-sheets
and https://www.andrewheiss.com/blog/2021/11/10/ame-bayes-re-guide/.
In particular, if you use re_formula = NULL
in the
fitted
command, you are including the random effects, while
if you use re_formula = NA
you are only getting the grand
mean (given the covariates). The smooth means of the fitted values with
re_formula = NULL
and the smooth means of the fitted values
with re_formula = NA
are exactly the same because the mean
of random effects is equal to zero.
fitted_mu <- fitted(brms_fit1, re_formula = NULL, scale = "response", dpar = "mu") %>%
as.data.frame()
colnames(fitted_mu) <- paste0("mu_", colnames(fitted_mu))
fitted_sigma <- fitted(brms_fit1, re_formula = NULL, scale = "response", dpar = "sigma") %>%
as.data.frame()
colnames(fitted_sigma) <- paste0("sigma_", colnames(fitted_sigma))
explanatory_after <- c("patient_id", "age", "dmg_sex", "diagnosedafter1yo", "dmg_dateofbirth", "f508_homozygous")
dependent <- "gli_fev"
fittedpars_withcovariates <- data_fit %>%
select(all_of(c(explanatory_after, dependent, "dmg_deathhaspatientdied", "Transplanted"))) %>%
drop_na() %>%
cbind(., fitted_mu, fitted_sigma)
fittedpars_withcovariates_reshaped <- fittedpars_withcovariates %>%
gather(dpar, Estimate, c("mu_Estimate", "sigma_Estimate")) %>%
mutate(ymin = ifelse(dpar == "sigma_Estimate", 0, 0.95),
ymax = ifelse(dpar == "sigma_Estimate", 0.7, 3.5)) %>%###ymin to make sure that y-axis in sigma subplots includes 0
select(-ends_with(c("Est.Error", "Q2.5", "Q97.5")))
###Plots
fittedpars_withcovariates_reshaped %>%
filter(patient_id %in% sample(fittedpars_withcovariates_reshaped$patient_id, 300)
& dpar == "sigma_Estimate") %>%
ggplot(aes(x = age, y = Estimate, group = patient_id, color = diagnosedafter1yo)) +
geom_line(alpha = 0.2) +
facet_grid( ~ dmg_sex, scales = "free") +
geom_smooth(data = filter(fittedpars_withcovariates_reshaped, dpar == "sigma_Estimate"),
aes(x = age, y = Estimate, group = diagnosedafter1yo, color = diagnosedafter1yo),
linewidth = 1.25) +
scale_color_manual(values = c("#5F4B8BFF", "#E69A8DFF"))
fittedpars_withcovariates_reshaped$dpar <- factor(fittedpars_withcovariates_reshaped$dpar,
labels = c(~ list("FEV"[1]~"mean"), ~ list("FEV"[1]~"SD")))
fittedpars_withcovariates_reshaped$dmg_sex <- factor(fittedpars_withcovariates_reshaped$dmg_sex,
labels = c(~ list("Females"), ~ list("Males")))
fittedpars_withcovariates_reshaped %>%
ggplot(aes(x = age, y = Estimate)) +
facet_grid(dpar ~ dmg_sex, scales = "free", labeller = label_parsed) +
geom_smooth(aes(x = age, y = Estimate,
group = dmg_deathhaspatientdied,
color = dmg_deathhaspatientdied,
fill = dmg_deathhaspatientdied),
size = 0, lty = 0, alpha = 0) +
geom_smooth(size = 1, col = 1, lty = 1) +
scale_color_manual(values = c( "#E69A8DFF","#5F4B8BFF"), labels = c("", "")) +
scale_fill_manual(values = c("#E69A8DFF", "#5F4B8BFF"), labels = c("", "")) +
geom_blank(aes(y = ymin)) + geom_blank(aes(y = ymax)) +
labs(x = "Age", y = "Average trajectory", fill = "", color = "", title = "")
fittedpars_withcovariates_reshaped %>%
ggplot(aes(x = age, y = Estimate,
group = dmg_deathhaspatientdied,
color = dmg_deathhaspatientdied,
fill = dmg_deathhaspatientdied,
lty = dmg_deathhaspatientdied)) +
facet_grid(dpar ~ dmg_sex, scales = "free", labeller = label_parsed) +
geom_smooth(size = 1) +
scale_color_manual(values = c("#445C40", "#E69A8DFF"), labels = c("No", "Yes")) +
scale_fill_manual(values = c("#445C40", "#E69A8DFF"), labels = c("No", "Yes")) +
scale_linetype_manual(values = c("11", "81"), labels = c("No", "Yes")) +
geom_blank(aes(y = ymin)) + geom_blank(aes(y = ymax)) +
xlim(18, 50) +
labs(x = "Age", y = "Average trajectory", fill = "Died during follow-up", color = "Died during follow-up", lty = "Died during follow-up", title = "")
As in the previous section, if you specify
re_formula = NA
you do not include any ID (you are getting
predictions for an average individual). If you need conditional effects
for existing individuals, then use re_formula = NULL
or
equivalently re_formula = ~ (1 | patient_id))
. If you want
a new hypothetical individual, you can use
patient_id = NA, allow_new_levels = TRUE, re_formula = ~ (1 | patient_ID)
and then you specify sample_new_levels
either equal to
“uncertainty” or “gaussian” in the prediction function.
yob_levels <- c(1963, 1984, 1993)
epred_byage <- brms_fit1 %>%
epred_draws(newdata = expand.grid(dmg_sex = c("F", "M"),
age = seq(18,50,by=2),
diagnosedafter1yo = c(0,1),
yob = yob_levels,
f508_homozygous = c(0,1), patient_id = "B155916"),
re_formula = NA, dpar = TRUE)
epred_byage_reshaped <- epred_byage %>%
gather(dpar, Estimate, c("mu", "sigma")) %>%
mutate(ymin = ifelse(dpar == "sigma", 0, 1.05),
ymax = ifelse(dpar == "sigma", 1, 2.75)) ###ymin to make sure that y-axis in sigma subplots includes 0
epred_byage_reshaped$dpar <- factor(epred_byage_reshaped$dpar,
labels = c(~ list("FEV"[1]~"mean"), ~ list("FEV"[1]~"SD")))
epred_byage_reshaped$dmg_sex <- factor(epred_byage_reshaped$dmg_sex,
labels = c(~ list("Females"), ~ list("Males")))
epred_byage_reshaped %>% filter(diagnosedafter1yo == 0, f508_homozygous == 1) %>%
ggplot(aes(x = age, y = Estimate,
color = rev(factor(yob)),
fill = rev(factor(yob)),
lty = rev(factor(yob)))) +
facet_grid(dpar ~ dmg_sex, scales = "free", labeller = label_parsed) +
geom_ribbon(aes(ymin = ymin, ymax = ymax), alpha = 0, colour = NA) + ###to get same y-axis across plots
stat_lineribbon(.width = 0.95, alpha = 1/4, show.legend = TRUE) +
scale_color_manual(values = c( "#445C40","#3E282B","#EF9806"),
labels = rev(yob_levels),
guide = guide_legend(reverse = TRUE)) +
scale_fill_manual(values = c("#445C40","#3E282B","#EF9806"),
labels = rev(yob_levels),
guide = guide_legend(reverse = TRUE)) +
scale_linetype_manual(values = c("11", "81", "solid"),
labels = rev(yob_levels),
guide = guide_legend(reverse = TRUE)) +
labs(x = "Age", y = "Prediction", linetype = "Year of birth",
fill = "Year of birth", color = "Year of birth",
title = "Homozygous F508, diagnosed before 1 year old") +
theme(plot.title=element_text(size=12))
epred_byage_reshaped %>% filter(diagnosedafter1yo == 0, yob == yob_levels[2]) %>%
ggplot(aes(x = age, y = Estimate,
color = factor(f508_homozygous),
fill = factor(f508_homozygous),
lty = factor(f508_homozygous))) +
facet_grid(dpar ~ dmg_sex, scales = "free", labeller = label_parsed) +
geom_ribbon(aes(ymin = ymin, ymax = ymax), alpha = 0, colour = NA) +
stat_lineribbon(.width = 0.95, alpha = 1/3, show.legend = TRUE) +
scale_color_manual(values = c("#3E282B","#EF9806"), labels = c("No", "Yes")) +
scale_fill_manual(values = c("#3E282B","#EF9806"), labels = c("No", "Yes")) +
scale_linetype_manual(values = c("11", "81"), labels = c("No", "Yes")) +
labs(x = "Age", y = "Prediction", linetype = "Homozygous F508",
fill = "Homozygous F508", color = "Homozygous F508",
title = paste0("Born in ", yob_levels[2], ", diagnosed before 1 year old")) +
theme(plot.title=element_text(size=12))
epred_byage_reshaped %>% filter(f508_homozygous == 1, yob == yob_levels[2]) %>%
ggplot(aes(x = age, y = Estimate,
color = factor(diagnosedafter1yo),
fill = factor(diagnosedafter1yo),
lty = factor(diagnosedafter1yo))) +
facet_grid(dpar ~ dmg_sex, scales = "free", labeller = label_parsed) +
geom_ribbon(aes(ymin = ymin, ymax = ymax), alpha = 0, colour = NA) +
stat_lineribbon(.width = 0.95, alpha = 1/3, show.legend = TRUE) +
scale_color_manual(values = c("#3E282B","#EF9806"), labels = c("No", "Yes")) +
scale_fill_manual(values = c("#3E282B","#EF9806"), labels = c("No", "Yes")) +
scale_linetype_manual(values = c("11", "81"), labels = c("No", "Yes")) +
labs(x = "Age", y = "Prediction", linetype = "Diagnosed after 1 year old",
fill = "Diagnosed after 1 year old", color = "Diagnosed after 1 year old",
title = paste0("Born in ", yob_levels[2], ", homozygous F508")) +
theme(plot.title=element_text(size=12))
cond_smooth_data <- lapply(conditional_smooths(brms_fit1, method = "fitted"), as.data.frame) %>%
bind_rows(.id = "coef_function") %>%
mutate(submodel = factor(str_extract(coef_function, "[^: ]+"), labels = c("Location submodel",
"Scale submodel")), xvar = factor(sapply(str_extract_all(coef_function, "[^(*),]+"),
"[[", 2)), byvar = str_match(coef_function, "by=\\s*(.*?)\\s*,")[, 2])
sexmaineffect <- summary(brms_fit1)$fixed[c("dmg_sexM", "sigma_dmg_sexM"), 1]
cond_smooth_data_WITHINTERACTION <- cond_smooth_data %>%
mutate_at(vars("estimate__", "lower__", "upper__"), funs(case_when(dmg_sex ==
"M" & submodel == "Location submodel" ~ . + sexmaineffect[1], dmg_sex ==
"M" & submodel == "Scale submodel" ~ . + sexmaineffect[2], TRUE ~ .)))
cond_smooth_data_WITHINTERACTION %>%
ggplot(., aes(x = effect1__, fill = dmg_sex)) + geom_line(aes(y = estimate__,
color = dmg_sex, linetype = dmg_sex)) + facet_grid(submodel ~ xvar, scales = "free",
switch = "y", labeller = labeller(xvar = c(age = "Age", yob = "Year of birth"),
submodel = c(`Location submodel` = "Mean trajectory", `Scale submodel` = "Variability (log SD)"))) +
geom_ribbon(aes(ymin = lower__, ymax = upper__), alpha = 0.3, colour = NA) +
labs(x = "", y = bquote("Association with FEV"[1]), fill = "Sex", color = "Sex",
linetype = "Sex") + theme(plot.margin = margin(t = 2, r = 10, b = 2, l = 2,
unit = "pt"), legend.position = "bottom", legend.margin = margin(0, 0, 0, 0,
unit = "pt"), legend.box.margin = margin(0, 0, 0, 0, unit = "pt"), panel.grid.minor = element_line(),
panel.spacing.y = unit(0.5, "lines"), axis.text = element_text(size = 10)) +
scale_color_manual(breaks = c("F", "M"), values = c("#F8766D", "#001D81")) +
scale_fill_manual(breaks = c("F", "M"), values = c("#F8766D", "#001D81")) + scale_linetype_manual(breaks = c("F",
"M"), values = c("81", "11"), na.value = "solid")
Let us look first at the standard deviation of the scale random effect. The probability that this parameter is higher than zero is very close to 1.
hyp <- c("sd_patient_id__sigma_Intercept^2 = 0")
hypothesis(brms_fit1, hyp, class = NULL)
## Hypothesis Tests for class :
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio
## 1 (sd_patient_id__s... = 0 0.2 0 0.19 0.21 NA
## Post.Prob Star
## 1 NA *
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
mcmc_dens(brms_fit1, pars = "sd_patient_id__sigma_Intercept") + labs(x = expression(sigma[omega]))
Bonus point: a subject specific random intercept is a category smoother (check https://www.tjmahr.com/random-effects-penalized-splines-same-thing/ to understand the equivalence between splines and random effects).
Now we create the dataset ranef_df
joining demographic
information and random effects.
visitcount <- data_fit %>%
group_by(patient_id) %>%
summarise(dmg_sex, no_visits = n(), ageatfirstvisit = first(age), ageatdiagnosis = first(dmg_ageatdiagnosis),
death_status = !is.na(dmg_dateofdeath)) %>%
distinct(across(everything()))
ranef_df0 <- ranef(brms_fit1, groups = "patient_id", probs = c(0.025, 0.975)) %>%
data.frame()
names(ranef_df0) <- str_remove(names(ranef_df0), "patient_id.")
ranef_df <- tibble::rownames_to_column(ranef_df0, var = "patient_id") %>%
left_join(., visitcount, by = "patient_id")
repeated_id <- ranef_df %>%
count(patient_id) %>%
filter(n > 1) %>%
pull(patient_id) ##appear twice in ranef_df ###recorded as both males and females
ranef_df <- filter(ranef_df, patient_id %ni% repeated_id)
ranef_df <- ranef_df %>%
mutate(sub_muintercept = Estimate.Intercept + summary(brms_fit1)$fixed["Intercept",
1], sub_sigmaintercept = exp(Estimate.sigma_Intercept + summary(brms_fit1)$fixed["sigma_Intercept",
1]))
ggplot(data = ranef_df[1:500, ], aes(x = Estimate.Intercept, y = Estimate.sigma_Intercept)) +
geom_linerange(aes(xmin = Q2.5.Intercept, xmax = Q97.5.Intercept), alpha = 0.1) +
geom_linerange(aes(ymin = Q2.5.sigma_Intercept, ymax = Q97.5.sigma_Intercept),
alpha = 0.1) + geom_point(aes(color = dmg_sex), alpha = 0.4, size = 0.9) +
geom_smooth(data = ranef_df, aes(x = Estimate.Intercept, y = Estimate.sigma_Intercept),
method = "lm", se = TRUE, col = 1) + geom_smooth(data = ranef_df, aes(x = Estimate.Intercept,
y = Estimate.sigma_Intercept, color = dmg_sex, fill = dmg_sex, linetype = dmg_sex),
method = "gam", formula = y ~ s(x, bs = "cs", k = 5), size = 1.25) + scale_color_manual(breaks = c("F",
"M"), values = c("#F8766D", "#001D81")) + scale_fill_manual(breaks = c("F", "M"),
values = c("#F8766D", "#001D81")) + scale_linetype_manual(breaks = c("F", "M"),
values = c("longdash", "solid"), na.value = "solid") + labs(x = "Location random effects",
y = "Scale random effects", color = "Sex", fill = "Sex", linetype = "Sex") +
theme(panel.grid.major = element_blank())
ggplot(data = ranef_df, aes(x = Estimate.sigma_Intercept, y = factor(no_visits),
fill = stat(x))) + geom_density_ridges_gradient(alpha = 0.6, rel_min_height = 0.01) +
theme_ridges() + labs(x = "Scale random effect", y = "Number of encounters") +
scale_fill_viridis_c(option = "C") + theme(legend.position = "none")
If you want to check autocorrelation of the MCMC samples, you can run the following code.
mcmc_plot(brms_fit1, type = "acf") + theme(strip.text.x = element_text(size = 14,
angle = 90)) ###you would like to see L-shaped plots
The model fit took approximately 20.44 hours.
Posterior predictive distribution:
\[p(\tilde{y}| y ) = \int p(\tilde{y}|\theta)p(\theta|y)d\theta\]
pp_check(brms_fit1) + theme(legend.position = "bottom") ##density overlay
pp_check(brms_fit1, type = "ecdf_overlay")
For multiple models, model comparison is performed in terms of best predictive ability, via either Bayesian LOO-CV and Watanabe AIC (WAIC, asymptotically equivalent to LOO-CV). The best model is the one with lower LOO-CV. Larger ELPD values indicate better fit.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252
## [2] LC_CTYPE=English_United Kingdom.1252
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] emmeans_1.8.2 survival_3.4-0 tidybayes_3.0.4 bayesplot_1.10.0
## [5] posterior_1.4.1 brmsmargins_0.2.0 brms_2.19.0 Rcpp_1.0.8.3
## [9] rsvg_2.3.2 magrittr_2.0.3 DiagrammeRsvg_0.1 DiagrammeR_1.0.9
## [13] pipeR_0.6.1.3 ggridges_0.5.4 ggpubr_0.6.0 stringr_1.5.0
## [17] kableExtra_1.3.4 knitr_1.42 lubridate_1.8.0 ggplot2_3.4.2
## [21] tidyr_1.3.0 dplyr_1.1.2 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] backports_1.4.1 systemfonts_1.0.4 plyr_1.8.7
## [4] igraph_1.3.1 splines_4.1.2 svUnit_1.0.6
## [7] crosstalk_1.2.0 rstantools_2.2.0 inline_0.3.19
## [10] digest_0.6.29 htmltools_0.5.2 fansi_1.0.3
## [13] checkmate_2.1.0 extraoperators_0.1.1 RcppParallel_5.1.5
## [16] matrixStats_0.62.0 xts_0.12.1 svglite_2.1.0
## [19] prettyunits_1.1.1 colorspace_2.0-3 rvest_1.0.3
## [22] ggdist_3.1.1 xfun_0.39 callr_3.7.0
## [25] crayon_1.5.1 jsonlite_1.8.0 zoo_1.8-10
## [28] glue_1.6.2 gtable_0.3.0 webshot_0.5.4
## [31] V8_4.2.0 distributional_0.3.0 car_3.1-0
## [34] pkgbuild_1.3.1 rstan_2.26.13 abind_1.4-5
## [37] scales_1.2.0 mvtnorm_1.1-3 rstatix_0.7.2
## [40] miniUI_0.1.1.1 viridisLite_0.4.0 xtable_1.8-4
## [43] finalfit_1.0.4 stats4_4.1.2 StanHeaders_2.26.13
## [46] DT_0.22 datawizard_0.6.3 htmlwidgets_1.5.4
## [49] httr_1.4.2 threejs_0.3.3 arrayhelpers_1.1-0
## [52] RColorBrewer_1.1-3 ellipsis_0.3.2 mice_3.14.0
## [55] pkgconfig_2.0.3 loo_2.5.1 farver_2.1.0
## [58] sass_0.4.0 utf8_1.2.2 labeling_0.4.2
## [61] tidyselect_1.2.0 rlang_1.1.0 reshape2_1.4.4
## [64] later_1.3.0 munsell_0.5.0 tools_4.1.2
## [67] visNetwork_2.1.2 cli_3.4.1 generics_0.1.2
## [70] broom_1.0.1 evaluate_0.15 fastmap_1.1.0
## [73] yaml_2.3.5 processx_3.5.3 purrr_1.0.1
## [76] nlme_3.1-153 mime_0.12 formatR_1.12
## [79] xml2_1.3.3 compiler_4.1.2 shinythemes_1.2.0
## [82] rstudioapi_0.13 curl_4.3.2 ggsignif_0.6.3
## [85] tibble_3.2.1 bslib_0.3.1 stringi_1.7.6
## [88] highr_0.9 ps_1.6.0 Brobdingnag_1.2-7
## [91] forcats_0.5.1 lattice_0.20-45 Matrix_1.3-4
## [94] markdown_1.1 shinyjs_2.1.0 tensorA_0.36.2
## [97] vctrs_0.6.1 pillar_1.9.0 lifecycle_1.0.3
## [100] jquerylib_0.1.4 bridgesampling_1.1-2 estimability_1.4.1
## [103] cowplot_1.1.1 data.table_1.14.2 insight_0.18.6
## [106] httpuv_1.6.5 R6_2.5.1 promises_1.2.0.1
## [109] gridExtra_2.3 codetools_0.2-18 boot_1.3-28
## [112] colourpicker_1.1.1 gtools_3.9.2 withr_2.5.0
## [115] shinystan_2.6.0 mgcv_1.8-38 bayestestR_0.13.0
## [118] parallel_4.1.2 grid_4.1.2 coda_0.19-4
## [121] rmarkdown_2.13 carData_3.0-5 shiny_1.7.1
## [124] base64enc_0.1-3 dygraphs_1.1.1.6