epiforecasts · kathsherratt · Feb 18, 2026 · Feb 19, 2026
diff --git a/R/analysis-descriptive.R b/R/analysis-descriptive.R
@@ -47,7 +47,6 @@ table_confint <- function(scores, group_var = NULL) {
     ) |>
     mutate(
       Models = paste0(n_models, " (", p_models, "%)"),
-      Forecasts = paste0(n_forecasts, " (", p_forecasts, "%)"),
       "Mean WIS (SD)" = paste0(round(mean, 2), " (", round(sd, 2), ")")
     )
 
@@ -99,7 +98,6 @@ print_table1 <- function(scores) {
     select(
       Variable,
       starts_with("Models_"),
-      starts_with("Forecasts_"),
       starts_with("Mean WIS (SD)_")
     )
   ## reorder
@@ -122,7 +120,8 @@ print_table1 <- function(scores) {
         "the European COVID-19 Forecast Hub, March 2021-2023. ",
         "Forecast performance was measured using the weighted ",
         "interval score (WIS), with a lower score indicating a more ",
-        "accurate forecast."
+        "accurate forecast. ",
+        "Numbers in brackets denote standard deviations."
       ),
       col.names = str_remove(colnames(table1), "_.*$"),
       align = c("l", rep("r", ncol(table1) - 1))
@@ -164,8 +163,8 @@ plot_over_time <- function(scores, ensemble, add_plot, show_uncertainty = TRUE)
     scale_x_date(date_labels = "%b %Y") +
     scale_fill_manual(
       values = c(
-        "Single-country" = "#e7298a",
-        "Multi-country" = "#e6ab02"
+        "Single-country" = "#1b9e77",
+        "Multi-country" = "#d95f02"
       ),
       aesthetics = c("col", "fill")
     ) +
@@ -201,7 +200,7 @@ plot_over_time <- function(scores, ensemble, add_plot, show_uncertainty = TRUE)
     scale_x_date(date_labels = "%b %Y") +
     scale_fill_brewer(
       aesthetics = c("col", "fill"),
-      type = "qual", palette = 2
+      type = "qual", palette = "Set2"
     ) +
     labs(
       x = NULL, y = "Mean WIS (log scale)",

diff --git a/R/plot-model-results.R b/R/plot-model-results.R
@@ -43,6 +43,10 @@ plot_models <- function(random_effects, scores, x_labels = TRUE,
                      position = position_dodge(width=1)) +
       geom_hline(yintercept = 0, lty = 2) +
       labs(y = "Partial effect", x = "", colour = NULL, shape = NULL) +
+      scale_shape_manual(
+        values = c("Single-country" = 16, "Multi-country" = 17),
+        drop = FALSE
+      ) +
       scale_alpha_manual(values = c("Adjusted" = 1, "Unadjusted" = 0.3)) +
       scale_colour_brewer(type = "qual", palette = 2) +
       facet_wrap(~epi_target, scales = "free_y", drop = TRUE) +

diff --git a/report/results.qmd b/report/results.qmd
@@ -1,6 +1,6 @@
 ---
 title: "Results"
-output: html_document
+format: html
 ---
 <!--- RMarkdown set up --->
 ```{r set-up, include=FALSE}
@@ -117,20 +117,23 @@ Only `r nrow(multi_targets |> filter(consistent))` models consistently forecast
 We explored the interval score (WIS) as a measure of predictive performance (Figure \@ref(fig:scores-over-time)), and characterised its association with model structure and number of countries targeted. 
 We used descriptive statistics and an unadjusted univariate model for each explanatory variable. 
 
-```{r scores-over-time, fig.height=8,fig.width=6,fig.cap=scores_over_time_cap}
-scores_over_time_cap <- "Predictive accuracy of multiple models' forecasts for COVID-19 cases and deaths across 32 European countries over time. Forecast performance is shown as the mean weighted interval score (WIS), where a lower score indicates better performance. Forecast performance is summarised across 32 target locations and 1 through 4 week forecast horizons, with varying numbers of forecasters participating over time. Shown for (A) the method structure used by each model; (B) the number of countries each model targeted (one or multiple); with (C) the total count of observed incidence across all 32 countries, shown on the log scale."
+```{r scores-over-time, fig.height=6,fig.width=6,fig.cap=scores_over_time_cap}
+scores_over_time_cap <- "Predictive accuracy of multiple models' forecasts for COVID-19 cases and deaths across 32 European countries over time. Forecast performance is shown as the unadjusted mean weighted interval score (WIS), where a lower score indicates better performance. Forecast performance is summarised across 32 target locations and 1 through 4 week forecast horizons, with varying numbers of forecasters participating over time. Shown for (A) the method structure used by each model; (B) the number of countries each model targeted (one or multiple). Unadjusted means are shown; see Figure 2 for adjusted estimates accounting for confounding factors."
 scores_over_time <- plot_over_time(
   scores = scores,
   ensemble = ensemble,
-  add_plot = data_plot(scores, log = TRUE),
   show_uncertainty = FALSE
 )
 scores_over_time
 ```
 
 ```{r}
 results <- readRDS(here("output", "results.rds"))
-table_effects <- results$effects |> 
+# Handle column name difference between saved results and current code
+if ("outcome_target" %in% names(results$effects)) {
+  results$effects <- rename(results$effects, epi_target = outcome_target)
+}
+table_effects <- results$effects |>
   mutate(upper_97.5_text = if_else(
            upper_97.5 < 0,
            paste0("(", round(upper_97.5, 2), ")"),
@@ -198,7 +201,7 @@ plot_effects(results$effects, variables = c("Method", "CountryTargets"))
 
 
 ```{r plot-models, fig.height=8, fig.width=10, fig.cap=model_cap}
-model_cap <- "Partial effect size (95% CI) by model. This can be interpreted as adjusted performance after accounting for all other variables in the model, with remaining differences in effects as seen here representing unexplained variation between models beyond these factors."
+model_cap <- "Partial effect size (95% CI) by individual model, shown by method classification (colour) and number of countries targeted (shape: circle = single-country, triangle = multi-country). Solid lines show adjusted estimates; faded lines show unadjusted estimates. This can be interpreted as adjusted performance after accounting for all other variables in the model, with remaining differences in effects as seen here representing unexplained variation between models beyond these factors."
 plot_models(results$effects, scores)
 ```
 
@@ -209,3 +212,19 @@ In developing this work, we explored several alternative model specifications. W
 We considered different ways of categorising explanatory variables, 
 
 and sensitivity analyses (see Supplementary Results).
+
+```{r export-figures, eval=FALSE, echo=FALSE}
+# Export figures in TIF format for journal submission (PLOS Comp Bio)
+# Uncomment and run manually to generate submission-ready figures
+# ggsave(here("output", "plots", "Figure1_scores_over_time.tif"),
+#        scores_over_time, width = 6, height = 6, dpi = 300,
+#        device = "tiff", compression = "lzw")
+# ggsave(here("output", "plots", "Figure2_partial_effects.tif"),
+#        plot_effects(results$effects, variables = c("Method", "CountryTargets")),
+#        width = 5, height = 3, dpi = 300,
+#        device = "tiff", compression = "lzw")
+# ggsave(here("output", "plots", "Figure3_model_effects.tif"),
+#        plot_models(results$effects, scores),
+#        width = 10, height = 8, dpi = 300,
+#        device = "tiff", compression = "lzw")
+```