This Supplemental file accompanies the manuscript: “Accuracy of computer-aided chest X-ray screening in the Kenya National Tuberculosis Prevalence Survey” by Mungai et al.
The Kenya National Tuberculosis, Leprosy and Lung Disease program is the custodian of the 2016 Kenya Tuberculosis Prevalence Survey data. As data cannot be made openly available, here we provide code and output to summarise the analysis and modelling approach taken.
Analysis was undertaken using R version 4.1.1.
library(tidyverse)
library(rstan)
library(tidybayes)
library(here)
library(posterior)
library(bayesplot)
library(cmdstanr)
library(arsenal)
library(scales)
library(RColorBrewer)
library(patchwork)
library(DiagrammeR)
Summarise the characteristics of the TB Prevalence Survey participants
#Load the raw dataset
load("kenyacad.rda")
#Some recoding and tidying up
#recode characteristics
kenyacad <- kenyacad %>%
mutate(field_read2 =case_when(xfindingall==1 ~ "Normal",
xfindingall==2 ~ "Suggestive of TB",
xfindingall==3 ~ "Abnormal, other")) %>%
mutate(c2w = case_when(
coughing=="Yes" & coughingweeksnumber >=2 ~ "Cough >=2w",
TRUE ~ "No cough >=2w"))%>%
mutate(tbeverbeentreated = case_when(tbeverbeentreated==1 ~ "Previous_TB",
tbeverbeentreated==0 ~ "Never_TB")) %>%
mutate(hiv = case_when(hiv==1 ~ "HIV_positive",
hiv==0 ~ "HIV_negative")) %>%
mutate(tbcurrenttreatment = case_when(tbcurrenttreatment=="Not taking TB treatment" ~ "Not_taking_TB_treatment",
tbcurrenttreatment=="Taking TB treatment" ~ "Taking_TB_treatment")) %>%
mutate(ageg2 = case_when(agegp=="15 - 24" ~ "below_45",
agegp=="25 - 34" ~ "below_45",
agegp=="35 - 44" ~ "below_45",
agegp=="45 - 54" ~ "above_45",
agegp=="55 - 64" ~ "above_45",
agegp=="65+" ~ "above_45")) %>%
mutate(tested = case_when(xpert_culture=="Positive" | xpert_culture=="Negative" ~ "Sputum_tested",
is.na(xpert_culture) ~ "Sputum_not_tested"))
#Set up table 1
mycontrols <- tableby.control(
numeric.stats=c("N", "median", "q1q3"),
stats.labels=list(N='Count', median='Median', q1q3='Q1,Q3'),
cat.test = "fe",
simulate.p.value=TRUE,
digits = 1,
numeric.simplify = TRUE)
#rename some variables for nice printing
t1_data <- kenyacad %>%
select(field_read2, `Age group`=agegp, Sex=sex, `HIV-status`=hiv,
Cough=coughing, `Cough>2 weeks`=c2w, Fever=fever, `Weight loss`=weightloss,
`Night sweats`=nightsweats, `Current TB treatment`=tbcurrenttreatment,
`Previous TB treatment`=tbeverbeentreated, CAD4TBv6=cad4tb6) %>%
mutate(Sex = case_when(Sex=="M" ~ "Male",
Sex=="F" ~ "Female"),
`HIV-status`=case_when(`HIV-status`=="HIV_positive" ~ "HIV-positive",
`HIV-status`=="HIV_negative" ~ "HIV-negative"),
`Cough>2 weeks`=case_when(`Cough>2 weeks`=="Cough >=2w" ~ "Yes",
`Cough>2 weeks`=="No cough >=2w" ~ "No"),
`Current TB treatment` = case_when(`Current TB treatment`=="Taking_TB_treatment" ~ "Yes",
`Current TB treatment`=="Not_taking_TB_treatment" ~ "No"),
`Previous TB treatment` = case_when(`Previous TB treatment`=="Previous_TB" ~ "Yes",
`Previous TB treatment`=="Never_TB" ~ "No"))
#Display table 1
t1a <- tableby(field_read2 ~ `Age group` + Sex + `HIV-status` + Cough + `Cough>2 weeks` +
Fever + `Weight loss` + `Night sweats` + `Current TB treatment` +
`Previous TB treatment` + CAD4TBv6, data=t1_data, control=mycontrols) %>%
summary(term.name = "", text = NULL)
t1a
Abnormal, other (N=5045) | Normal (N=50045) | Suggestive of TB (N=6758) | Total (N=61848) | p value | |
---|---|---|---|---|---|
Age group | < 0.001 | ||||
15 - 24 | 403 (8.0%) | 16305 (32.6%) | 742 (11.0%) | 17450 (28.2%) | |
25 - 34 | 627 (12.4%) | 13400 (26.8%) | 1167 (17.3%) | 15194 (24.6%) | |
35 - 44 | 765 (15.2%) | 9072 (18.1%) | 1273 (18.8%) | 11110 (18.0%) | |
45 - 54 | 831 (16.5%) | 5553 (11.1%) | 1032 (15.3%) | 7416 (12.0%) | |
55 - 64 | 901 (17.9%) | 3210 (6.4%) | 957 (14.2%) | 5068 (8.2%) | |
65+ | 1518 (30.1%) | 2505 (5.0%) | 1587 (23.5%) | 5610 (9.1%) | |
Sex | < 0.001 | ||||
Female | 3410 (67.6%) | 29432 (58.8%) | 3345 (49.5%) | 36187 (58.5%) | |
Male | 1635 (32.4%) | 20613 (41.2%) | 3413 (50.5%) | 25661 (41.5%) | |
HIV-status | < 0.001 | ||||
N-Miss | 2871 | 23895 | 3587 | 30353 | |
HIV-negative | 2055 (94.5%) | 24985 (95.5%) | 2878 (90.8%) | 29918 (95.0%) | |
HIV-positive | 119 (5.5%) | 1165 (4.5%) | 293 (9.2%) | 1577 (5.0%) | |
Cough | < 0.001 | ||||
No | 4065 (80.6%) | 43989 (87.9%) | 4752 (70.3%) | 52806 (85.4%) | |
Yes | 980 (19.4%) | 6056 (12.1%) | 2006 (29.7%) | 9042 (14.6%) | |
Cough>2 weeks | < 0.001 | ||||
No | 4504 (89.3%) | 47859 (95.6%) | 5484 (81.1%) | 57847 (93.5%) | |
Yes | 541 (10.7%) | 2186 (4.4%) | 1274 (18.9%) | 4001 (6.5%) | |
Fever | < 0.001 | ||||
No | 4408 (87.4%) | 47088 (94.1%) | 5566 (82.4%) | 57062 (92.3%) | |
Yes | 637 (12.6%) | 2957 (5.9%) | 1192 (17.6%) | 4786 (7.7%) | |
Weight loss | < 0.001 | ||||
No | 4921 (97.5%) | 49173 (98.3%) | 6212 (91.9%) | 60306 (97.5%) | |
Yes | 124 (2.5%) | 872 (1.7%) | 546 (8.1%) | 1542 (2.5%) | |
Night sweats | < 0.001 | ||||
No | 4046 (80.2%) | 45546 (91.0%) | 5077 (75.1%) | 54669 (88.4%) | |
Yes | 999 (19.8%) | 4499 (9.0%) | 1681 (24.9%) | 7179 (11.6%) | |
Current TB treatment | < 0.001 | ||||
No | 5043 (100.0%) | 50023 (100.0%) | 6724 (99.5%) | 61790 (99.9%) | |
Yes | 2 (0.0%) | 22 (0.0%) | 34 (0.5%) | 58 (0.1%) | |
Previous TB treatment | < 0.001 | ||||
No | 4883 (96.8%) | 49171 (98.3%) | 5710 (84.5%) | 59764 (96.6%) | |
Yes | 162 (3.2%) | 874 (1.7%) | 1048 (15.5%) | 2084 (3.4%) | |
CAD4TBv6 | < 0.001 | ||||
Count | 5045 | 50045 | 6758 | 61848 | |
Median | 48.0 | 43.0 | 52.0 | 44.0 | |
Q1,Q3 | 45.0, 53.0 | 24.0, 46.0 | 46.0, 62.0 | 27.0, 48.0 |
s1 <- kenyacad %>%
group_by(tested) %>%
median_hdci(cad4tb6) %>%
mutate(text = paste0("Median: ", cad4tb6, " (95%HDI: ",.lower, "-",.upper,")"))
s1
s2 <- kenyacad %>%
filter(!is.na(xpert_culture)) %>%
group_by(xpert_culture) %>%
median_hdci(cad4tb6) %>%
mutate(text = paste0("Median: ", cad4tb6, " (95%HDI: ",.lower, "-",.upper,")"))
s2
nrow_tested <- kenyacad %>%
filter(tested=="Sputum_tested") %>%
nrow(.)
nrow_nottested <- kenyacad %>%
filter(tested=="Sputum_not_tested") %>%
nrow(.)
p1 <- kenyacad %>%
mutate(tested = case_when(tested=="Sputum_tested" ~ paste0("Sputum tested ", "(n=", comma(nrow_tested),")", sep=""),
tested=="Sputum_not_tested" ~ paste0("Sputum not tested ", "(n=", comma(nrow_nottested),")", sep=""))) %>%
ggplot(aes(x=cad4tb6, tested)) +
stat_halfeye(point_interval = median_hdi, .width = c(.95), fill="skyblue") +
theme_bw(base_size = 10) +
scale_x_continuous(limits = c(0,99), breaks = c(0,25,50,75,99), labels = c(0,25,50,75,99)) +
labs(x = "CAD4TBv6 score",
y="",
subtitle = "By whether sputum tested",
caption = "Sputum tested: median=49 (95%HDI: 9-82)\nSputum not tested: median=44 (95%HDI: 4-55)")
p1
nrow_positive <- kenyacad %>%
filter(xpert_culture=="Positive") %>%
nrow(.)
nrow_negative <- kenyacad %>%
filter(xpert_culture=="Negative") %>%
nrow(.)
p2 <- kenyacad %>%
filter(!is.na(xpert_culture)) %>%
mutate(xpert_culture = case_when(xpert_culture=="Positive" ~ paste0("Sputum positive ", "(n=", comma(nrow_positive),")", sep=""),
xpert_culture=="Negative" ~ paste0("Sputum negative ", "(n=", comma(nrow_negative),")", sep=""))) %>%
ggplot(aes(x=cad4tb6, xpert_culture)) +
stat_halfeye(point_interval = median_hdi, .width = c(.95), fill="orange") +
theme_bw(base_size = 10) +
scale_x_continuous(limits = c(0,99), breaks = c(0,25,50,75,99), labels = c(0,25,50,75,99)) +
labs(x = "CAD4TBv6 score",
y="",
subtitle = "By sputum microbiological results (if tested)",
caption = "Sputum-positive: median=72 (95%HDI: 38-98)\nSputum-negative: median=49 (95%HDI: 8-79)")
p2
pw <- p1 / p2
pw + plot_annotation(
tag_levels = 'A' )&
theme(plot.tag = element_text(face = 'bold'),
title = element_text(size=10))
ggsave(here("fig2.png"), width=8, height=10, dpi=600)
The model is composed of three component models: TB prevalence, Standard testing, and CAD4TBv6 models. To compare the accuracy of standard testing and CAD4TBv6, we used a Bayesian latent class model for canonical correlation analysis for linking to TB status, which is not directly observable, instead of assuming either of them as a reference standard, to the results of the two sets of tests.
For implementing the two testing components, we used a mixture distribution approach. That is, there were two counterfactuals of TB and non-TB for each subject surveyed; each counterfactual has a respective likelihood for each test, and each subject has a probability of having TB. The likelihood of the screening models is the average likelihood of test results for TB and non-TB weighted by the probabilities of having TB or not. Across all tests included, we applied the assumption that we assumed both standard testing and CAD4TBv6 are better than flipping a fair coin, at least. That is, the TB status relates a higher chance of positivity from standard TB screening and a higher CAD4TBv6 score.
The TB prevalence model links the determinants of prevalent TB and the TB status. We structure the TB prevalence model with a logistic regression model for estimating the TB prevalence. The TB status is not directly supported by the data, so the probability of TB is related to the test results of standard testing and CAD4TBv6. The sub-model is formulated as follows:
\[\begin{align} D_i &\sim Bernouilli(p_i)\\ p_i &= logit(p_0) + \lambda X \\ logit(p_0) &\sim Normal(-5.219282, 0.4076352) \\ \lambda &\sim Normal(0, 1) \end{align}\]
where \(d_i\) denotes the status of TB disease of subject \(i\) which is a latent class variable, \(\lambda\) denotes the coefficients linking the covariates of \(X\) to the probability of \(D_i\). The prior for \(logit(p_0)\) was inferred from the results of the TB prevalence survey.
We modelled CAD4TBv6 with Beta regression for proportional data or other data bounded between 0 and 1. The CAD4TB evaluates CXR images and scores them from 0 to 99. We remap the scores to a variable ranging from 0 to 1.
The CAD4TBv6 model is formulated as follows:
For the counterfactual with TB (\(D_i = 1\)):
\[\begin{align} Y_{CAD, i} &\sim Beta(gA_1, gB_1)\\ gA_1 &= \nu_1 \kappa \\ gB_1 &= (1 - \nu_1) \kappa \\ logit(\nu_1) &= \gamma_1 + \gamma X \end{align}\]
For the counterfactual without TB (\(d_i = 0\)):
\[\begin{align} Y_{CAD, i} &\sim Beta(gA_0, gB_0)\\ gA_0 &= \nu_0 \kappa \\ gB_0 &= (1 - \nu_0) \kappa \\ logit(\nu_0) &= \gamma_0 + \gamma X \end{align}\]
with prior distributions: \[\begin{align} \gamma &\sim Normal(0, 10) \\ \gamma_0 &\sim Normal(0, 10) \\ \gamma_1 - \gamma_0 &\sim HalfNormal(0, 1) \\ \kappa &\sim inverseGamma(1, 1) \end{align}\]
where \(Y_{CAD, i}\) denotes the score of CAD4TBv6,;`\(\gamma\) denotes the coefficients linking the covariates of \(X\), \(\gamma_1\) and \(\gamma_0\) are the intercept terms of the regression function with and without TB respectively; \(\kappa\) is a scale parameter determining the variance of the distribution. Lastly, we weighted the two counterfactuals by the probabilities of \(D_i = 1\) or not.
In this study, standard testing is a TB detection approach used commonly in TB prevalence surveys. The algorithm started with a CXR read by a human reader and initial consultation followed by a sputum microbiological testing on those with CXR-abnormalities or TB-suggestive symptoms (here cough >=2 weeks).
We modelled the results of CXR read by human readers with a logistic regression model. The TB and non-TB models share the same coefficients of predictors, while the counterfactual with TB has a larger value of intercept term which implied the subjects with TB are more likely to have positive results.
Following CXR read by human readers, we modelled the results of sputum microbiological testing with a Bernoulli process with the probability of sensitivity of TB and (1 – specificity) of non-TB. Considering the quality of sputum samples are symptom related, we assumed the sensitivity differs by coughing >= 2 weeks status. For an identifiability issue that the sensitivity and the specificity cannot simultaneously be characterised in the model, we used an exogenously given specificity of 99%.
The standard testing model is formulated as follows:
For the counterfactual with TB (\(D_i = 1\)):
\[\begin{align} Y_{CO, i} &\sim Bernouilli(r_i)\\ r_i &= \beta_0 + \beta X\\ \beta &\sim Normal(0, 10) \\ \beta_0 &\sim Normal(0, 10) \end{align}\] if sputum sample collected and tested: \[\begin{align} Y_{Sput, i} &\sim Bernouilli(sens_{sput})\\ \end{align}\]
For the counterfactual without TB (\(D_i = 0\)):
\[\begin{align} Y_{CO, i} &\sim Bernouilli(r_i)\\ r_i &= \beta_1 + \beta X\\ \end{align}\] if sputum sample collected and tested: \[\begin{align} Y_{Sput, i} &\sim Bernouilli(1 - spec_{sput}) \end{align}\]
with prior distributions: \[\begin{align} \beta &\sim Normal(0, 10) \\ \beta_0 &\sim Normal(0, 10) \\ \beta_1 - \beta_0 &\sim HalfNormal(0, 1) \\ 2*(sens_{sput} - 0.5) &\sim Beta(2, 4) \end{align}\]
where \(Y_{CO, i}\) is the results of CXR read by clinical officers; \(\beta\) denotes the coefficients linking the covariates of \(X\) to CXR positivities; \(\beta_1\) and \(\beta_0\) are the intercept terms of the regression function with and without TB respectively. \(Y_{Sput, i}\) is the results of sputum tests; \(sens_{sput}\) and \(spec_{sput}\) denote the sensitivity and specificity of sputum tests the values related with/without coughing \(\ge\) 2 weeks. Lastly, we weighted the two counterfactuals by the probabilities of \(D_i = 1\) or not.
Code below runs the Stan model (is computationally intensive, so commented out here).
##read the tidied dataset
# dat <- read_rds("2021-06-17_kcxr.rds")
# #Define model inputs as a list
# inputs <- with(dat, {
# list(
# N = nrow(dat),
# CAD = CAD,
# CO = CO,
# Spu = Spu,
# SpuMis = c(SpuMis),
# N_Cov = 3,
# Covs = cbind(Male, Before, Age_centre),
# Cough2w = c(Cough2w),
# spec_sput = 0.99,
# spec_sput_c2w = 0.99
# )
# })
# #Define modelling sampling parameters
# n_iter = 4000 #Number of posterior samples
# n_collect = 2000 #Number of post warm-up samples
# n_chain = 3 #number of chains
# n_core = 1 #number of cores
## Set up the model
## This calls the model_c_mono.stan file
# file <- file.path("model_c_mono.stan")
##Now fit the model
##WARNING: THIS IS SLOW AND MEMORY INTENSIVE!!!
##Saves about a 30Gb object
# model <- cmdstan_model(file)
##Now extract the posterios from the cmdstanr model object
# csv_files <- c("model_c_mono-202110040818-1-03568b.csv",
# "model_c_mono-202110040818-2-03568b.csv",
# "model_c_mono-202110040818-3-03568b.csv")
#
# x <- read_cmdstan_csv(
# csv_files,
# variables = c("sens_sput", "sens_sput_c2w", "lambda", "beta", "gamma", "gamma0", "gamma1", "k", "beta0", "beta1", "logit_p"),
# sampler_diagnostics = ""
# )
#
# saveRDS(x, "x.rds")
#
# csv_files <- c("model_c_mono-202110040818-1-03568b.csv",
# "model_c_mono-202110040818-2-03568b.csv",
# "model_c_mono-202110040818-3-03568b.csv")
#
# co_out <- read_cmdstan_csv(
# csv_files,
# variables = c("sens_co", "spec_co"),
# sampler_diagnostics = ""
# )
#
# saveRDS(co_out, "co_out.rds")
Read in the data and the modelling output.
#read in the cmdstanr model output object
posterior <- read_rds(here("posterior.rds"))
#read the data in
dat <- read_rds("2021-06-17_kcxr.rds")
dat <- dat %>%
mutate(Age_centre = scale(Age))
#read in the posterior samples
x <- read_rds(here("x.rds"))
co_out <- read_rds("co_out.rds")
#The inputs that we used for the model.
#Needed for the summarise function
inputs <- with(dat, {
list(
N = nrow(dat),
CAD = CAD,
CO = CO,
Spu = Spu,
SpuMis = c(SpuMis),
N_Cov = 3,
Covs = cbind(Male, Before, Age_centre),
Cough2w = c(Cough2w),
spec_sput = 0.99,
spec_sput_c2w = 0.99
)
})
posterior::summarise_draws(x$post_warmup_draws)
posterior::summarise_draws(co_out$post_warmup_draws)
mcmc_trace(x$post_warmup_draws)
mcmc_trace(co_out$post_warmup_draws)
Function to summarise prevalence estimates.
summarise_prev <- function(post, inputs, keys, dat, itermax = 1000) {
require(tidyverse)
require(posterior)
inv_logit <- function(x) { exp(x) / (1 + exp(x))}
covs <- inputs$Covs
pars <- as_tibble(as_draws_df(post$post_warmup_draws))
pars <- list(
lambda = pars %>% select(starts_with("lambda[")) %>% as.matrix(),
logit_p = pars %>% select(logit_p) %>% pull()
)
n_iter <- nrow(pars$gamma)
n_sample <- nrow(covs)
itermax <- min(n_iter, itermax)
# calculate prevalence
prev <- bind_rows(lapply(1:itermax, function(i) {
tibble(ID = keys, MC = i,
p = inv_logit(pars$logit_p[i] + colSums(t(covs) * pars$lambda[i, ])))
})) %>% filter(ID %in% keys)
prev %>% left_join(dat %>% mutate(ID = 1:n()))
}
#Overall prevalence
selected <- 1:inputs$N # all samples
prev <- summarise_prev(x, inputs, keys=selected, dat)
## Joining, by = "ID"
Summary_Prev <- bind_rows(
prev %>%
group_by(MC) %>%
summarise(p = sum(p) / n()) %>%
ungroup() %>%
summarise(
Group = "All",
Prev = mean(p),
Prev_L = quantile(p, 0.025),
Prev_U = quantile(p, 0.975),
),
prev %>%
group_by(MC, Male) %>% ## select MC and grouping
summarise(p = sum(p) / n()) %>%
group_by(Male) %>% ## keep grouping only
summarise(
Group="Stratified",
Prev = mean(p),
Prev_L = quantile(p, 0.025),
Prev_U = quantile(p, 0.975),
) %>% ungroup()
)
## `summarise()` has grouped output by 'MC'. You can override using the `.groups` argument.
Summary_Prev %>%
mutate_at(.vars = vars(Prev, Prev_L, Prev_U),
.funs = funs(.*100000))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Clinical Officer read accuracy
summarise_co <- function(posterior, inputs, keys, itermax = 1000) {
require(tidyverse)
require(posterior)
inv_logit <- function(x) { exp(x) / (1 + exp(x))}
logit <- function(p) {log(p / (1 - p))}
covs <- inputs$Covs
pars <- as_tibble(as_draws_df(posterior$post_warmup_draws))
pars <- list(
beta0 = pars %>% select(beta0) %>% pull(),
beta1 = pars %>% select(beta1) %>% pull(),
beta = pars %>% select(starts_with("beta[")) %>% as.matrix(),
k = pars %>% select(k) %>% pull()
)
n_iter <- nrow(pars$beta)
n_sample <- nrow(covs)
if (missing(keys)) keys <- 1:n_sample
itermax <- min(n_iter, itermax)
covs <- covs[keys, ]
# sample clinical officer values
co <- bind_rows(lapply(1:itermax, function(i) {
tibble(ID = keys, MC = i,
p0 = inv_logit(pars$beta0[i] + colSums(t(covs) * pars$beta[i, ])),
p1 = inv_logit(pars$beta1[i] + colSums(t(covs) * pars$beta[i, ])))
})) %>% filter(ID %in% keys) %>%
group_by(MC) %>%
summarise(
SensCO = mean(p1),
SpecCO = mean(1 - p0)
) %>%
ungroup() %>%
summarise(
across(c(SensCO, SpecCO), list(
M = mean,
L = function(x) quantile(x, 0.025),
U = function(x) quantile(x, 0.975)
))
) %>%
pivot_longer(everything(), names_to = "Index") %>%
tidyr::extract(Index, c("Index", "Stat"), "(\\w+)_(M|L|U)") %>%
pivot_wider(names_from = Stat)
co
}
itermax <- 1000
sel_all <- 1:inputs$N
sel_new <- which(inputs$Covs[, 2] == 0)
sel_rel <- which(inputs$Covs[, 2] == 1)
co_all <- summarise_co(posterior = x, inputs = inputs, keys = sel_all, itermax = itermax)
co_new <- summarise_co(posterior = x, inputs = inputs, keys = sel_new, itermax = itermax)
co_rel <- summarise_co(posterior = x, inputs = inputs, keys = sel_rel, itermax = itermax)
co_all
co_new
co_rel
The accuracy of Clinical Officer then appears to vary substantially by previous TB status, with sensitivity being much higher in those with a history of previous TB treatment, and lower in those not previously treated for TB.
sel_female <- which(inputs$Covs[, 1] == 0)
sel_male <- which(inputs$Covs[, 1] == 1)
co_male <- summarise_co(posterior = x, inputs = inputs, keys = sel_male, itermax = itermax)
co_female <- summarise_co(posterior = x, inputs = inputs, keys = sel_female, itermax = itermax)
co_male
co_female
No difference by sex though.
sel_nocough <- which(inputs$Cough2w == 0)
sel_cough2w <- which(inputs$Cough2w == 1)
co_nocough <- summarise_co(posterior = x, inputs = inputs, keys = sel_nocough, itermax = itermax)
co_cough2w <- summarise_co(posterior = x, inputs = inputs, keys = sel_cough2w, itermax = itermax)
co_nocough
co_cough2w
And sensitivity was a little higher among those with cough of >2w.
Function to summarise conditional posterior draws
summarise_cad <- function(posterior, inputs, keys, itermax = 1000) {
require(tidyverse)
require(posterior)
inv_logit <- function(x) { exp(x) / (1 + exp(x))}
covs <- inputs$Covs
pars <- as_tibble(as_draws_df(x$post_warmup_draws))
pars <- list(
gamma0 = pars %>% select(gamma0) %>% pull(),
gamma1 = pars %>% select(gamma1) %>% pull(),
gamma = pars %>% select(starts_with("gamma[")) %>% as.matrix(),
k = pars %>% select(k) %>% pull()
)
n_iter <- nrow(pars$gamma)
n_sample <- nrow(covs)
if (missing(keys)) keys <- 1:n_sample
itermax <- min(n_iter, itermax)
covs <- covs[keys, ]
# sample CAD values
cad <- bind_rows(lapply(1:itermax, function(i) {
tibble(ID = keys, MC = i,
nu0 = inv_logit(pars$gamma0[i] + colSums(t(covs) * pars$gamma[i, ])),
nu1 = inv_logit(pars$gamma1[i] + colSums(t(covs) * pars$gamma[i, ]))) %>%
mutate(
gA0 = nu0 * pars$k[i],
gB0 = (1 - nu0) * pars$k[i],
gA1 = nu1 * pars$k[i],
gB1 = (1 - nu1) * pars$k[i],
CAD0 = rbeta(n(), gA0, gB0),
CAD1 = rbeta(n(), gA1, gB1)
)
})) %>% filter(ID %in% keys)
# calculate sensivity and specificity by cut points
rocs <- bind_rows(lapply(seq(0, 1, 0.01), function(cut) {
cad %>%
group_by(MC) %>%
summarise(Sens = mean(CAD1 > cut), Spec = mean(CAD0 < cut)) %>%
mutate(Cut = cut)
}))
# ROC intervals based on sensitivity
rocs_mci_sens <- bind_rows(lapply(unique(rocs$MC) , function(mc) {
sel <- rocs %>%
filter(MC == mc)
as_tibble(approx(sel$Sens, sel$Spec, seq(0, 1, 0.01))) %>%
rename(Sens = x, Spec = y) %>%
mutate(MC = mc)
})) %>%
group_by(Sens) %>%
summarise(Spec_M = median(Spec), Spec_L = quantile(Spec, .025), Spec_U = quantile(Spec, .975))
# ROC intervals based on specificity
rocs_mci_spec <- bind_rows(lapply(unique(rocs$MC) , function(mc) {
sel <- rocs %>%
filter(MC == mc)
as_tibble(approx(sel$Spec, sel$Sens, seq(0, 1, 0.01))) %>%
rename(Spec = x, Sens = y) %>%
mutate(MC = mc)
})) %>%
group_by(Spec) %>%
summarise(Sens_M = median(Sens), Sens_L = quantile(Sens, .025), Sens_U = quantile(Sens, .975))
# Calculate area under ROC curves
auc <- rocs %>%
group_by(MC) %>%
summarise(AUC = sum((Sens + lag(Sens))[-1] * diff(Spec)) / 2)
auc_mci <- (auc %>% arrange(AUC))[round(quantile(1:itermax, c(0.025, 0.5, 0.975))), ]
# Keep curves at median and 95% credible interval
rocs_mci <- rocs %>%
filter(MC %in% auc_mci$MC) %>%
mutate(
Index = case_when(
MC == auc_mci$MC[1] ~ "L",
MC == auc_mci$MC[2] ~ "M",
MC == auc_mci$MC[3] ~ "U"
)
) %>%
arrange(MC, Cut) %>%
select(-MC) %>%
pivot_wider(values_from = c(Sens, Spec), names_from = Index)
list(
ROC = rocs,
AUC = auc,
ROC_mci = rocs_mci,
ROC_mci_sens = rocs_mci_sens,
ROC_mci_spec = rocs_mci_spec
)
}
This is slow and memory intensive, so commented out here.
# selected <- 1:inputs$N # all samples
#
# ### itermax: size of posterior to be used in the calculation
#
# CAD <- summarise_cad(posterior = x, inputs = inputs, keys = selected, itermax = itermax)
#
# saveRDS(CAD, "CAD.rds")
#read in the summarised chains
CAD <- read_rds(here("CAD.rds"))
roc <- CAD$ROC
roc_mci_sens <- CAD$ROC_mci_sens
roc_mci_spec <- CAD$ROC_mci_spec
roc_mci <- CAD$ROC_mci
auc <- CAD$AUC
auc %>% summarise(mean_auc = mean(AUC),
q2.5_AUC = quantile(AUC, probs = 0.025),
q97.5_AUC = quantile(AUC, probs = 0.975))
Summarise and plot the overall sensitivity and specificity by CAD4TBv6 threshold
roc_mci_sens %>%
filter(Sens >0.94 & Sens <0.96)
#CAD score = 55
roc_mci_sens %>%
filter(Sens >0.89 & Sens<0.91)
#CAD score = 61
sens_plot <- CAD$ROC %>%
group_by(Cut) %>%
summarise(
Sens_M = median(Sens),
Sens_L = quantile(Sens, 0.025),
Sens_U = quantile(Sens, 0.975)
) %>%
ggplot() +
geom_rect(data=co_all %>% filter(Index=="SensCO"),
aes(ymin=L,ymax=U, xmin=0, xmax=100,
fill="#1B9E77"), alpha=0.3) +
geom_hline(data= co_all %>% filter(Index=="SensCO"),
aes(yintercept=M,
colour="#1B9E77")) +
geom_segment(aes(y=0.95, yend=0.95,
x=0, xend=55, alpha=0.99), colour="#EFA00B", linetype=5)+
geom_segment(aes(x=55, xend=55,
y=0, yend=0.95,alpha=0.99), colour="#EFA00B", linetype=5) +
geom_segment(aes(y=0.90, yend=0.90,
x=0, xend=61, alpha=0.98), colour="#D65108", linetype=5) +
geom_segment(aes(x=61, xend=61,
y=0, yend=0.90, alpha=0.98), colour="#D65108", linetype=5) +
geom_ribbon(aes(x=Cut*100, ymin=Sens_L, ymax = Sens_U, fill="#0D324D"), alpha=0.3) +
geom_line(aes(x=Cut*100, y=Sens_M, colour="#0D324D")) +
theme_ggdist() +
labs(x="CAD4TBv6 threshold",
y="Sensitivity (95% CrI)") +
scale_y_continuous(expand = c(0, 0),
labels = scales::percent_format(accuracy = 1),
breaks = c(0,0.25, 0.44, 0.50,0.75,0.90,0.95,1.0)) +
scale_x_continuous(expand = c(0, 0), breaks = c(0,25,55,61,75,99)) +
scale_fill_identity(name = "Screening test",
breaks = c("#1B9E77", "#0D324D"),
labels = c("Clinical Officer","CAD4TBv6"),
guide = "legend") +
scale_colour_identity(name = "Screening test",
breaks = c("#1B9E77", "#0D324D"),
labels = c("Clinical Officer","CAD4TBv6"),
guide = "legend") +
scale_alpha_identity(name = "Target Product Profile",
breaks = c(0.99, 0.98),
labels = c("Optimal","Minimal"),
guide=guide_legend(override.aes =
list(colour=c("#EFA00B", "#D65108"))))
sens_plot
Specificity
roc_mci_spec %>%
filter(Spec >0.79 & Spec <0.81)
#CAD score=53
roc_mci_spec %>%
filter(Spec >0.69 & Spec<0.71)
#CAD score = 47
spec_plot <- CAD$ROC %>%
group_by(Cut) %>%
summarise(
Spec_M = median(Spec),
Spec_L = quantile(Spec, 0.025),
Spec_U = quantile(Spec, 0.975)
) %>%
ggplot() +
geom_rect(data=co_all %>% filter(Index=="SpecCO"),
aes(ymin=L,ymax=U, xmin=0, xmax=100,
fill="#1B9E77"), alpha=0.3) +
geom_hline(data= co_all %>% filter(Index=="SpecCO"),
aes(yintercept=M,
colour="#1B9E77")) +
geom_segment(aes(y=0.80, yend=0.80,
x=0, xend=53, alpha=0.99), colour="#EFA00B", linetype=5)+
geom_segment(aes(x=53, xend=53,
y=0, yend=0.80,alpha=0.99), colour="#EFA00B", linetype=5) +
geom_segment(aes(y=0.70, yend=0.70,
x=0, xend=47, alpha=0.98), colour="#D65108", linetype=5) +
geom_segment(aes(x=47, xend=47,
y=0, yend=0.70, alpha=0.98), colour="#D65108", linetype=5) +
geom_ribbon(aes(x=Cut*100, ymin=Spec_L, ymax = Spec_M, fill="#0D324D"), alpha=0.3) +
geom_line(aes(x=Cut*100, y=Spec_M, colour="#0D324D")) +
theme_ggdist() +
labs(x="CAD4TBv6 threshold",
y="Sensitivity (95% CrI)") +
scale_y_continuous(expand = c(0, 0),
labels = scales::percent_format(accuracy=1),
breaks = c(0,0.25,0.50,0.70,0.80,0.89,1.0)) +
scale_x_continuous(expand = c(0, 0), breaks = c(0,25,47,53,75,99)) +
scale_fill_identity(name = "Screening test",
breaks = c("#1B9E77", "#0D324D"),
labels = c("Clinical Officer","CAD4TBv6"),
guide = "legend") +
scale_colour_identity(name = "Screening test",
breaks = c("#1B9E77", "#0D324D"),
labels = c("Clinical Officer","CAD4TBv6"),
guide = "legend") +
scale_alpha_identity(name = "Target Product Profile",
breaks = c(0.99, 0.98),
labels = c("Optimal","Minimal"),
guide=guide_legend(override.aes =
list(colour=c("#EFA00B", "#D65108"))))
spec_plot
Combine together for Figure 3.
f3 <- sens_plot / spec_plot
f3 + plot_layout(guides = "collect")
ggsave("fig3.png", height = 8)
## Saving 7 x 8 in image
Function to summarise stratified accuracy estimates.
cad_selector <- function(data=dat, Male, Cough2w, Before, Age1, Age2, Age3){
data %>%
mutate(id=row_number()) %>%
filter(Male=={{Male}} & Cough2w=={{Cough2w}} & Before=={{Before}}
& Age1=={{Age1}} & Age2=={{Age2}} & Age3=={{Age3}}) %>%
pull(id)
}
selected <- crossing(Male=dat$Male, Cough2w=dat$Cough2w,
Before=dat$Before, Age1=dat$Age1,
Age2=dat$Age2, Age3=dat$Age3) %>%
filter(Age1 + Age2 + Age3 == 1) %>%
mutate(selected = pmap(list(Male, Cough2w, Before, Age1, Age2, Age3),
~ cad_selector(Male = ..1,
Cough2w = ..2,
Before = ..3,
Age1 = ..4,
Age2 = ..5,
Age3 = ..6)))
itermax <- 600
##Stratified by participant characteristics
##This is very memory intensive, so commented out here.
# CAD_stratifed <- selected %>%
# mutate(id=1:n()) %>%
# rowwise() %>%
# mutate(
# sum = list(summarise_cad(posterior = posterior, inputs = inputs, keys = selected, itermax = itermax)))
#
# write_rds(CAD_stratifed, here("CAD_stratified.rds"))
Estimate stratified accuracy
CAD_stratified <- read_rds(here("CAD_stratified.rds"))
sums_stratified <- CAD_stratified %>%
hoist(sum,
ROC="ROC",
AUC="AUC",
ROC_mci = "ROC_mci"
)
rocs_stratified <- sums_stratified %>%
select(Male, Cough2w, ROC, Age1, Age2, Age3, Before) %>%
unnest() %>%
mutate(Test = "CAD4TB")
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(ROC)`
aucs_stratified <- sums_stratified %>%
select(Male, Cough2w, AUC, Age1, Age2, Age3, Before) %>%
unnest()%>%
mutate(Test = "CAD4TB")
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(AUC)`
aucs_sum_stratified <- aucs_stratified %>%
summarise(M = median(AUC), L = quantile(AUC, 0.025), U = quantile(AUC, 0.975)) %>%
as.list()
Plot the stratified estimates of accuracy (Figure 4)
z <- rocs_stratified %>%
pivot_longer(c(Sens, Spec), names_to = "Index") %>%
mutate(Cut = as.numeric(scales::number(Cut*100, accuracy = 1))) %>%
mutate_at(.vars = vars(Male, Cough2w, Age1, Age2, Age3, Before),
.funs = funs(case_when(
.==0 ~ "No",
.==1 ~ "Yes"
))) %>%
mutate(Age = case_when(Age1=="No" & Age2=="No" & Age3=="Yes" ~ "41+ years",
Age1=="No" & Age2=="Yes" & Age3=="No" ~ "26-40 years",
Age1=="Yes" & Age2=="No" & Age3=="No" ~ "15-25 years")) %>%
mutate(Index = case_when(Index=="Sens" ~ "Sensitivity",
Index=="Spec" ~ "Specificity")) %>%
mutate(Male = case_when(Male=="No" ~ "Women",
Male=="Yes" ~ "Men"))
z_h <- z %>%
mutate_at(.vars = vars(Male, Cough2w, Age, Before),
.funs = funs(factor)) %>%
pivot_wider(names_from = Index,
values_from = value)
z2 <- z %>%
filter(Cut==55) %>%
group_by(Male, Cough2w, Index, Before, Age, Cut) %>%
summarise(mean = mean(value),
l = quantile(value, probs=0.025),
u = quantile(value, probs=0.975)) %>%
mutate(posx = 90,
posy = 0.75) %>%
rename(Sex = Male)
## `summarise()` has grouped output by 'Male', 'Cough2w', 'Index', 'Before', 'Age'. You can override using the `.groups` argument.
z2 <- z2 %>%
mutate(mci = paste0(scales::percent(mean, accuracy = 1), " (", scales::percent(l, accuracy = 1), "-", scales::percent(u, accuracy = 1), ")"))
sens_plot2 <- z_h %>%
group_by(Cut, Male, Cough2w, Before, Age) %>%
rename(`Cough>2w` = Cough2w,
`Previous TB` = Before,
Sex = Male) %>%
summarise(Sens_M = mean(Sensitivity),
Sens_L = quantile(Sensitivity, probs=0.025),
Sens_U = quantile(Sensitivity, probs=0.975)) %>%
ggplot() +
geom_line(aes(x=Cut, y=Sens_M), colour="darkorchid") +
geom_ribbon(aes(x=Cut, ymin=Sens_L, ymax=Sens_U), alpha=0.3, fill="darkorchid") +
geom_segment(data=z2 %>% filter(Index=="Sensitivity")%>%
rename(`Cough>2w` = Cough2w,
`Previous TB` = Before), aes(x=0, xend=Cut, y=mean, yend=mean), linetype="dashed") +
geom_segment(data=z2 %>% filter(Index=="Sensitivity")%>%
rename(`Cough>2w` = Cough2w,
`Previous TB` = Before), aes(x=Cut, xend=Cut, y=0, yend=mean), linetype="dashed") +
geom_text(data=z2 %>% filter(Index=="Sensitivity")%>%
rename(`Cough>2w` = Cough2w,
`Previous TB` = Before), aes(x=50, y=1.15, label=mci),
family="Open Sans Condensed Light") +
scale_y_continuous("Sensitivity (95% CrI)", labels = scales::percent, breaks = c(0,0.25,0.50,0.75,1)) +
facet_grid(Age ~ Sex + `Cough>2w` + `Previous TB`, scales = "free_x",
labeller = labeller(.cols = label_both)) +
theme_ggdist()+
scale_x_continuous("CAD4TBv6 score", breaks = c(0,25, 55, 75, 99)) +
theme(strip.background =element_rect(fill=NA, colour="darkgrey"))
## `summarise()` has grouped output by 'Cut', 'Sex', 'Cough>2w', 'Previous TB'. You can override using the `.groups` argument.
spec_plot2 <-z_h %>%
group_by(Cut, Male, Cough2w, Before, Age) %>%
rename(`Cough>2w` = Cough2w,
`Previous TB` = Before,
Sex = Male) %>%
summarise(Spec_M = mean(Specificity),
Spec_L = quantile(Specificity, probs=0.025),
Spec_U = quantile(Specificity, probs=0.975)) %>%
ggplot() +
geom_line(aes(x=Cut, y=Spec_M), colour="mediumseagreen") +
geom_ribbon(aes(x=Cut, ymin=Spec_L, ymax=Spec_U), alpha=0.3, fill="mediumseagreen") +
geom_segment(data=z2 %>% filter(Index=="Specificity")%>%
rename(`Cough>2w` = Cough2w,
`Previous TB` = Before), aes(x=0, xend=Cut, y=mean, yend=mean), linetype="dashed") +
geom_segment(data=z2 %>% filter(Index=="Specificity")%>%
rename(`Cough>2w` = Cough2w,
`Previous TB` = Before), aes(x=Cut, xend=Cut, y=0, yend=mean), linetype="dashed") +
geom_text(data=z2 %>% filter(Index=="Specificity") %>%
rename(`Cough>2w` = Cough2w,
`Previous TB` = Before), aes(x=50, y=1.15, label=mci),
family="Open Sans Condensed Light") +
scale_y_continuous("Specificity (95% CrI)", labels = scales::percent, breaks = c(0,0.25,0.50,0.75,1)) +
facet_grid(Age ~ Sex + `Cough>2w` + `Previous TB`, scales = "free_x",
labeller = labeller(.cols = label_both)) +
theme_ggdist()+
scale_x_continuous("CAD4TBv6 score", breaks = c(0,25, 55, 75, 99))+
theme(strip.background =element_rect(fill=NA, colour="darkgrey"))
## `summarise()` has grouped output by 'Cut', 'Sex', 'Cough>2w', 'Previous TB'. You can override using the `.groups` argument.
sens_plot2 / spec_plot2
ggsave("fig4.png", height=12, width=10)