Skip to content

Commit 3875105

Browse files
authored
Merge pull request #144 from epiforecasts/pr2-fix-variant-code
Fix variant phase assignment with per-location approach
2 parents 282cb0d + 3e1c341 commit 3875105

9 files changed

Lines changed: 60 additions & 42 deletions

File tree

R/analysis-model.R

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ library(purrr)
1919
library(mgcv)
2020
library(gammit)
2121
library(gratia)
22+
library(ggplot2)
2223
source(here("R", "process-data.R"))
2324

2425
model_wis <- function(scoring_scale = "log", output_dir = "output") {

R/utils-variants.R

Lines changed: 53 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ classify_variant_phases <- function() {
7373
# set a complete grid of dates and locations in ecdc year-weeks
7474
date_grid <- tibble(
7575
target_end_date = seq.Date(from = as.Date("2020-01-04"),
76-
to = Sys.Date(), by = 7),
76+
to = as.Date("2023-03-17"), by = 7),
7777
year = isoyear(target_end_date),
7878
week = isoweek(target_end_date)) |>
7979
group_by(year, week) |>
@@ -102,53 +102,65 @@ classify_variant_phases <- function() {
102102

103103
# Identify dominant variant in each week and location
104104
set_variant_phases <- function(variant_data, date_location) {
105+
# Aggregate variant percentages: first sum sub-variants within each source
106+
# (e.g., BA.4 + BA.5 → Omicron-BA.4/5), then average across sources
105107
dominant <- variant_data |>
106-
group_by(location, target_end_date, source) |>
108+
filter(!is.na(variant_percent)) |>
109+
group_by(location, target_end_date, source, variant_name) |>
110+
summarise(variant_percent = sum(variant_percent), .groups = "drop") |>
111+
group_by(location, target_end_date, variant_name) |>
112+
summarise(variant_percent = mean(variant_percent), .groups = "drop") |>
113+
group_by(location, target_end_date) |>
107114
slice_max(order_by = variant_percent, with_ties = FALSE, n = 1) |>
108115
mutate(variant_name = as.character(variant_name)) |>
109116
ungroup()
110117

111-
# # use single sequential phases for each location -------
112-
# dominant_phases <- dominant |>
113-
# mutate(dominant_name = ifelse(variant_name == "Other", NA,
114-
# variant_name)) |>
115-
# filter(!is.na(dominant_name)) |>
116-
# # get only one first date for each dominant variant in each location
117-
# group_by(location) |>
118-
# arrange(target_end_date) |>
119-
# group_by(dominant_name, .add = TRUE) |>
120-
# summarise(target_end_date = first(target_end_date), .groups = "drop") |>
121-
# # expand out to all weeks
122-
# right_join(date_location,
123-
# by = c("location", "target_end_date")) |>
124-
# group_by(location) |>
125-
# arrange(target_end_date) |>
126-
# fill(dominant_name, .direction = "downup") |>
127-
# mutate(VariantPhase = factor(dominant_name)) |>
128-
# select(location, target_end_date, VariantPhase)
129-
130-
# average date ------------------------------------------------------------
131-
# TODO REMOVE - HACK
132-
# get median average first date for each dominant variant across all locations
133-
dominant_start_mid <- dominant |>
118+
# Expected chronological order of variant phases
119+
variant_order <- c("Alpha", "Delta", "Omicron-BA.1", "Omicron-BA.2",
120+
"Omicron-BA.4/5", "Omicron-BQ/XBB")
121+
122+
# Use single sequential phases for each location
123+
# Find first date each named variant was dominant (>50%) in each location
124+
phase_starts <- dominant |>
134125
mutate(dominant_name = ifelse(variant_name == "Other", NA,
135126
variant_name)) |>
136-
filter(!is.na(dominant_name)) |>
127+
filter(!is.na(dominant_name), variant_percent > 50) |>
137128
group_by(location) |>
138129
arrange(target_end_date) |>
139130
group_by(dominant_name, .add = TRUE) |>
140131
summarise(target_end_date = first(target_end_date), .groups = "drop") |>
141-
group_by(dominant_name) |>
142-
summarise(target_end_date = median(target_end_date))
143-
dominant_mid <- date_location |>
144-
left_join(dominant_start_mid, by = "target_end_date") |>
132+
# Enforce chronological ordering: remove out-of-sequence phases
133+
mutate(variant_rank = match(dominant_name, variant_order)) |>
134+
filter(!is.na(variant_rank)) |>
135+
group_by(location) |>
136+
arrange(target_end_date) |>
137+
filter(variant_rank == cummax(variant_rank)) |>
138+
select(-variant_rank)
139+
140+
# Manual override: Hungary had sparse ECDC surveillance data in early 2021,
141+
# causing Delta to backfill to the start of the timeseries. Alpha was dominant
142+
# before Delta spread from late July 2021.
143+
# Source: https://abouthungary.hu/news-in-brief/delta-and-gamma-variant-identified-in-hungary
144+
hu_alpha_start <- min(date_location$target_end_date)
145+
hu_delta_start <- as.Date("2021-07-31") # week following 23 July 2021
146+
phase_starts <- phase_starts |>
147+
# Remove any data-derived Alpha/Delta starts for HU
148+
filter(!(location == "HU" & dominant_name %in% c("Alpha", "Delta"))) |>
149+
bind_rows(tibble(
150+
location = "HU",
151+
dominant_name = c("Alpha", "Delta"),
152+
target_end_date = c(hu_alpha_start, hu_delta_start)
153+
))
154+
155+
# Expand out to all weeks
156+
dominant_phases <- phase_starts |>
157+
right_join(date_location,
158+
by = c("location", "target_end_date")) |>
145159
group_by(location) |>
146160
arrange(target_end_date) |>
147161
fill(dominant_name, .direction = "downup") |>
148162
mutate(VariantPhase = factor(dominant_name)) |>
149163
select(location, target_end_date, VariantPhase)
150-
dominant_phases <- dominant_mid
151-
# TODO end ---------------------------------------------------------
152164

153165
return(dominant_phases)
154166

@@ -164,7 +176,8 @@ get_variants_ch <- function(date_grid, variant_names) {
164176
select(variant = variant_type,
165177
target_end_date = date,
166178
variant_percent = prct_mean7d) |>
167-
filter(target_end_date %in% date_grid$target_end_date)
179+
filter(target_end_date %in% date_grid$target_end_date) |>
180+
mutate(source = "CH-WGS")
168181

169182
# 2022-2023: weekly data from hospital sampling
170183
# https://covid19.admin.ch/api/data/20231206-0sxi4s4a/sources/COVID19Variants_hosp_w.csv
@@ -173,13 +186,15 @@ get_variants_ch <- function(date_grid, variant_names) {
173186
select(variant = variant_type,
174187
year_week,
175188
variant_percent = prct) |>
176-
left_join(date_grid, by = c("year_week")) |>
177-
filter(year_week %in% date_grid$year_week &
178-
!target_end_date %in% variants_ch_wgs$target_end_date)
189+
left_join(distinct(date_grid, year_week, .keep_all = TRUE),
190+
by = c("year_week")) |>
191+
filter(year_week %in% date_grid$year_week) |>
192+
mutate(source = "CH-Hosp")
179193

194+
# Keep both sources; overlap is handled by set_variant_phases() which
195+
# sums sub-variants within each source then averages across sources
180196
ch <- bind_rows(variants_ch_wgs, variants_ch_hosp) |>
181-
mutate(location = "CH",
182-
source = "National") |>
197+
mutate(location = "CH") |>
183198
filter(variant != "all_sequenced") |>
184199
mutate(variant_name = as.character(fct_recode(variant, !!!variant_names))) |>
185200
select(location, source, target_end_date, variant_name, variant_percent)

output/plots/check_Cases.pdf

148 KB
Binary file not shown.

output/plots/check_Deaths.pdf

203 KB
Binary file not shown.

output/results.rds

79 Bytes
Binary file not shown.

report/Revision_manuscript.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ We also classified models by geographic target specificity. We assessed the tota
7070

7171
*Confounding factors associated with epidemiological context*
7272

73-
We considered aspects of the forecasting context that we believed could influence associations with forecast performance beyond the effects of interest. Among models, forecast performance typically declines with longer forecast horizons. We also considered the wider epidemiological setting of the forecast target. This included the trend of incidence in the target week for each location, which we categorised as “Stable”, “Decreasing”, or “Increasing”, based on the difference over a three-week moving average of incidence (with a change of \+/-5% as “Stable”; see Supplement). We also considered differing transmission characteristics with each Sars-Cov2 variant. We used publicly available sequence data to classify each target location-week according to the dominant circulating variant, creating a sequence of phases (see Supplement) (\#ref-variant-data). We considered extraneous variation at the country level, to account for, for example, effects of population size or different transmission probabilities with national policy.
73+
We considered aspects of the forecasting context that we believed could influence associations with forecast performance beyond the effects of interest. Among models, forecast performance typically declines with longer forecast horizons. We also considered the wider epidemiological setting of the forecast target. This included the trend of incidence in the target week for each location, which we categorised as “Stable”, “Decreasing”, or “Increasing”, based on the difference over a three-week moving average of incidence (with a change of \+/-5% as “Stable”; see Supplement). We also considered differing transmission characteristics with each SARS-CoV-2 variant. We used publicly available genomic surveillance data from ECDC, UKHSA, and the Swiss Federal Office of Public Health to classify each target location-week according to the dominant circulating variant, assigning sequential variant phases (Alpha, Delta, Omicron BA.1, BA.2, BA.4/5, BQ/XBB) separately for each country based on when each variant first exceeded 50% of sequenced samples (see Supplement for details). We considered extraneous variation at the country level, to account for, for example, effects of population size or different transmission probabilities with national policy.
7474

7575
**Analysis**
7676

report/results.qmd

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,8 @@ table_effects <- results$effects |>
140140
paste0("(", round(upper_97.5, 2), ")"),
141141
as.character(round(upper_97.5, 2))
142142
)) |>
143-
mutate(value_ci = paste0(round(value, 2),
144-
" (", round(lower_2.5, 2), "-",
143+
mutate(value_ci = paste0(round(value, 2),
144+
" (", round(lower_2.5, 2), "-",
145145
upper_97.5_text, ")"),
146146
group = paste(epi_target, model, group, sep = "_")) |>
147147
column_to_rownames("group")

report/supplement/Supplement.Rmd

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ scores |>
121121

122122
```{r variant-phases, fig.cap="Variant phases identified by dominant variant in each location and week"}
123123
# see: R/utils-variants.R
124-
variant_phases <- classify_variant_phases()
124+
variant_phases <- classify_variant_phases()
125125
variant_phases |>
126126
ggplot(aes(x = target_end_date, y = location, fill = VariantPhase)) +
127127
geom_tile() +
@@ -131,6 +131,8 @@ variant_phases |>
131131
theme(legend.position = "bottom")
132132
```
133133

134+
Genomic surveillance data were obtained from three sources: ECDC (covering 30 European countries), UKHSA (Great Britain), and the Swiss Federal Office of Public Health (Switzerland). Variant lineages were mapped to six named phases in expected chronological order: Alpha, Delta, Omicron-BA.1, Omicron-BA.2, Omicron-BA.4/5, and Omicron-BQ/XBB. For each country, we identified the first week in which each named variant exceeded 50% of sequenced samples. We enforced chronological ordering by removing any out-of-sequence phases, then expanded phase assignments to all weeks by filling forward and backward from observed transition dates. This per-location approach accounts for the fact that variant dominance dates differed substantially across European countries. Where genomic surveillance data were too sparse to identify a transition (Hungary), we supplemented with epidemiological reports to set the Alpha-to-Delta transition date.
135+
134136
\newpage
135137

136138
## Model fitting

report/supplement/Supplement.pdf

10.8 KB
Binary file not shown.

0 commit comments

Comments
 (0)