epiforecasts · kathsherratt · Feb 19, 2026 · Feb 18, 2026 · Feb 18, 2026 · Feb 19, 2026
diff --git a/R/analysis-model.R b/R/analysis-model.R
@@ -19,6 +19,7 @@ library(purrr)
 library(mgcv)
 library(gammit)
 library(gratia)
+library(ggplot2)
 source(here("R", "process-data.R"))
 
 model_wis <- function(scoring_scale = "log", output_dir = "output") {

diff --git a/R/utils-variants.R b/R/utils-variants.R
@@ -73,7 +73,7 @@ classify_variant_phases <- function() {
   # set a complete grid of dates and locations in ecdc year-weeks
   date_grid <- tibble(
     target_end_date = seq.Date(from = as.Date("2020-01-04"),
-                               to = Sys.Date(), by = 7),
+                               to = as.Date("2023-03-17"), by = 7),
     year = isoyear(target_end_date),
     week = isoweek(target_end_date)) |>
     group_by(year, week) |>
@@ -102,53 +102,65 @@ classify_variant_phases <- function() {
 
 # Identify dominant variant in each week and location
 set_variant_phases <- function(variant_data, date_location) {
+  # Aggregate variant percentages: first sum sub-variants within each source
+  # (e.g., BA.4 + BA.5 → Omicron-BA.4/5), then average across sources
   dominant <- variant_data |>
-    group_by(location, target_end_date, source) |>
+    filter(!is.na(variant_percent)) |>
+    group_by(location, target_end_date, source, variant_name) |>
+    summarise(variant_percent = sum(variant_percent), .groups = "drop") |>
+    group_by(location, target_end_date, variant_name) |>
+    summarise(variant_percent = mean(variant_percent), .groups = "drop") |>
+    group_by(location, target_end_date) |>
     slice_max(order_by = variant_percent, with_ties = FALSE, n = 1) |>
     mutate(variant_name = as.character(variant_name)) |>
     ungroup()
 
-  # # use single sequential phases for each location -------
-  # dominant_phases <- dominant |>
-  #   mutate(dominant_name = ifelse(variant_name == "Other", NA,
-  #                                 variant_name)) |>
-  #   filter(!is.na(dominant_name)) |>
-  #   # get only one first date for each dominant variant in each location
-  #   group_by(location) |>
-  #   arrange(target_end_date) |>
-  #   group_by(dominant_name, .add = TRUE) |>
-  #   summarise(target_end_date = first(target_end_date), .groups = "drop") |>
-  #   # expand out to all weeks
-  #   right_join(date_location,
-  #              by = c("location", "target_end_date")) |>
-  #   group_by(location) |>
-  #   arrange(target_end_date) |>
-  #   fill(dominant_name, .direction = "downup") |>
-  #   mutate(VariantPhase = factor(dominant_name)) |>
-  #   select(location, target_end_date, VariantPhase)
-
-  # average date ------------------------------------------------------------
-  # TODO REMOVE - HACK
-  # get median average first date for each dominant variant across all locations
-  dominant_start_mid <- dominant |>
+  # Expected chronological order of variant phases
+  variant_order <- c("Alpha", "Delta", "Omicron-BA.1", "Omicron-BA.2",
+                     "Omicron-BA.4/5", "Omicron-BQ/XBB")
+
+  # Use single sequential phases for each location
+  # Find first date each named variant was dominant (>50%) in each location
+  phase_starts <- dominant |>
     mutate(dominant_name = ifelse(variant_name == "Other", NA,
                                   variant_name)) |>
-    filter(!is.na(dominant_name)) |>
+    filter(!is.na(dominant_name), variant_percent > 50) |>
     group_by(location) |>
     arrange(target_end_date) |>
     group_by(dominant_name, .add = TRUE) |>
     summarise(target_end_date = first(target_end_date), .groups = "drop") |>
-    group_by(dominant_name) |>
-    summarise(target_end_date = median(target_end_date))
-  dominant_mid <- date_location |>
-    left_join(dominant_start_mid, by = "target_end_date") |>
+    # Enforce chronological ordering: remove out-of-sequence phases
+    mutate(variant_rank = match(dominant_name, variant_order)) |>
+    filter(!is.na(variant_rank)) |>
+    group_by(location) |>
+    arrange(target_end_date) |>
+    filter(variant_rank == cummax(variant_rank)) |>
+    select(-variant_rank)
+
+  # Manual override: Hungary had sparse ECDC surveillance data in early 2021,
+  # causing Delta to backfill to the start of the timeseries. Alpha was dominant
+  # before Delta spread from late July 2021.
+  # Source: https://abouthungary.hu/news-in-brief/delta-and-gamma-variant-identified-in-hungary
+  hu_alpha_start <- min(date_location$target_end_date)
+  hu_delta_start <- as.Date("2021-07-31") # week following 23 July 2021
+  phase_starts <- phase_starts |>
+    # Remove any data-derived Alpha/Delta starts for HU
+    filter(!(location == "HU" & dominant_name %in% c("Alpha", "Delta"))) |>
+    bind_rows(tibble(
+      location = "HU",
+      dominant_name = c("Alpha", "Delta"),
+      target_end_date = c(hu_alpha_start, hu_delta_start)
+    ))
+
+  # Expand out to all weeks
+  dominant_phases <- phase_starts |>
+    right_join(date_location,
+               by = c("location", "target_end_date")) |>
     group_by(location) |>
     arrange(target_end_date) |>
     fill(dominant_name, .direction = "downup") |>
     mutate(VariantPhase = factor(dominant_name)) |>
     select(location, target_end_date, VariantPhase)
-  dominant_phases <- dominant_mid
-  # TODO end  ---------------------------------------------------------
 
   return(dominant_phases)
 
@@ -164,7 +176,8 @@ get_variants_ch <- function(date_grid, variant_names) {
     select(variant = variant_type,
            target_end_date = date,
            variant_percent = prct_mean7d) |>
-    filter(target_end_date %in% date_grid$target_end_date)
+    filter(target_end_date %in% date_grid$target_end_date) |>
+    mutate(source = "CH-WGS")
 
   # 2022-2023: weekly data from hospital sampling
   # https://covid19.admin.ch/api/data/20231206-0sxi4s4a/sources/COVID19Variants_hosp_w.csv
@@ -173,13 +186,15 @@ get_variants_ch <- function(date_grid, variant_names) {
     select(variant = variant_type,
            year_week,
            variant_percent = prct) |>
-    left_join(date_grid, by = c("year_week")) |>
-    filter(year_week %in% date_grid$year_week &
-             !target_end_date %in% variants_ch_wgs$target_end_date)
+    left_join(distinct(date_grid, year_week, .keep_all = TRUE),
+              by = c("year_week")) |>
+    filter(year_week %in% date_grid$year_week) |>
+    mutate(source = "CH-Hosp")
 
+  # Keep both sources; overlap is handled by set_variant_phases() which
+  # sums sub-variants within each source then averages across sources
   ch <- bind_rows(variants_ch_wgs, variants_ch_hosp) |>
-    mutate(location = "CH",
-           source = "National") |>
+    mutate(location = "CH") |>
     filter(variant != "all_sequenced") |>
     mutate(variant_name = as.character(fct_recode(variant, !!!variant_names))) |>
     select(location, source, target_end_date, variant_name, variant_percent)

diff --git a/output/plots/check_Cases.pdf b/output/plots/check_Cases.pdf
diff --git a/output/plots/check_Deaths.pdf b/output/plots/check_Deaths.pdf
diff --git a/output/results.rds b/output/results.rds
diff --git a/report/Revision_manuscript.md b/report/Revision_manuscript.md
@@ -70,7 +70,7 @@ We also classified models by geographic target specificity. We assessed the tota
 
 *Confounding factors associated with epidemiological context*
 
-We considered aspects of the forecasting context that we believed could influence associations with forecast performance beyond the effects of interest. Among models, forecast performance typically declines with longer forecast horizons. We also considered the wider epidemiological setting of the forecast target. This included the trend of incidence in the target week for each location, which we categorised as “Stable”, “Decreasing”, or “Increasing”, based on the difference over a three-week moving average of incidence (with a change of \+/-5% as “Stable”; see Supplement). We also considered differing transmission characteristics with each Sars-Cov2 variant. We used publicly available sequence data to classify each target location-week according to the dominant circulating variant, creating a sequence of phases (see Supplement) (\#ref-variant-data). We considered extraneous variation at the country level, to account for, for example, effects of population size or different transmission probabilities with national policy.
+We considered aspects of the forecasting context that we believed could influence associations with forecast performance beyond the effects of interest. Among models, forecast performance typically declines with longer forecast horizons. We also considered the wider epidemiological setting of the forecast target. This included the trend of incidence in the target week for each location, which we categorised as “Stable”, “Decreasing”, or “Increasing”, based on the difference over a three-week moving average of incidence (with a change of \+/-5% as “Stable”; see Supplement). We also considered differing transmission characteristics with each SARS-CoV-2 variant. We used publicly available genomic surveillance data from ECDC, UKHSA, and the Swiss Federal Office of Public Health to classify each target location-week according to the dominant circulating variant, assigning sequential variant phases (Alpha, Delta, Omicron BA.1, BA.2, BA.4/5, BQ/XBB) separately for each country based on when each variant first exceeded 50% of sequenced samples (see Supplement for details). We considered extraneous variation at the country level, to account for, for example, effects of population size or different transmission probabilities with national policy.
 
 **Analysis**
 

diff --git a/report/results.qmd b/report/results.qmd
@@ -140,8 +140,8 @@ table_effects <- results$effects |>
            paste0("(", round(upper_97.5, 2), ")"),
            as.character(round(upper_97.5, 2))
          )) |>
-  mutate(value_ci = paste0(round(value, 2), 
-                           " (", round(lower_2.5, 2), "-", 
+  mutate(value_ci = paste0(round(value, 2),
+                           " (", round(lower_2.5, 2), "-",
                            upper_97.5_text, ")"),
          group = paste(epi_target, model, group, sep = "_")) |>
   column_to_rownames("group")

diff --git a/report/supplement/Supplement.Rmd b/report/supplement/Supplement.Rmd
@@ -121,7 +121,7 @@ scores |>
 
 ```{r variant-phases, fig.cap="Variant phases identified by dominant variant in each location and week"}
 # see: R/utils-variants.R
-variant_phases <- classify_variant_phases() 
+variant_phases <- classify_variant_phases()
 variant_phases |>
     ggplot(aes(x = target_end_date, y = location, fill = VariantPhase)) +
     geom_tile() +
@@ -131,6 +131,8 @@ variant_phases |>
     theme(legend.position = "bottom")
 ```
 
+Genomic surveillance data were obtained from three sources: ECDC (covering 30 European countries), UKHSA (Great Britain), and the Swiss Federal Office of Public Health (Switzerland). Variant lineages were mapped to six named phases in expected chronological order: Alpha, Delta, Omicron-BA.1, Omicron-BA.2, Omicron-BA.4/5, and Omicron-BQ/XBB. For each country, we identified the first week in which each named variant exceeded 50% of sequenced samples. We enforced chronological ordering by removing any out-of-sequence phases, then expanded phase assignments to all weeks by filling forward and backward from observed transition dates. This per-location approach accounts for the fact that variant dominance dates differed substantially across European countries. Where genomic surveillance data were too sparse to identify a transition (Hungary), we supplemented with epidemiological reports to set the Alpha-to-Delta transition date.
+
 \newpage
 
 ## Model fitting 

diff --git a/report/supplement/Supplement.pdf b/report/supplement/Supplement.pdf