rstats-tartu
last update: 2021-02-16 21:15:21
Dataset
Worldwide cases and deaths
Cases and deaths on relative time
scale
Risk of death
COVID-19 cases in Estonia
Estonian COVID-19 tests handling
Small selection of graphs illustrating daily developments in COVID-19 epidemic. Code is shown on purpose, in case you want to recreate these plots.
Phylogenetic- and geographic distribution of SARS-CoV-2, COVID-19 causing virus, is available on https://auspice.credibleinterval.ee/sarscov2.
SARS-Cov-2 phylogenetic tree is based solely on sequences published on NCBI https://www.ncbi.nlm.nih.gov/genbank/sars-cov-2-seqs/
Daily COVID-19 worldwide data is from European Centre for Disease Prevention and Control.
Estonian COVID-19 dataset was downloaded from Estonian Healthboard https://opendata.digilugu.ee/opendata_covid19_test_results.csv
Datasets were downloaded using scripts/get_data.R
script and saved to data
folder.
Report was rendered using rmarkdown::render()
command. To render, you
can run following command from shell:
Rscript -e "rmarkdown::render('scripts/main.R', output_format = rmarkdown::github_document(), output_file = 'README.md')"
Loading libraries.
pkg <- c("dplyr", "tidyr", "readr", "lubridate", "here", "ggplot2", "directlabels", "tibbletime")
invisible(lapply(pkg, library, character.only = TRUE))
eu <- read_csv("https://datahub.io/opendatafortaxjustice/listofeucountries/r/listofeucountries.csv") %>%
rename(country = x)
Importing downloaded ECDC daily COVID-19 dataset.
path <- here("data/COVID-19-geographic-distribution-worldwide.csv")
covid <- read_csv(path, col_types = cols(daterep = col_date(format = "%d/%m/%Y")))
Resetting timeline to days since first case in each country.
covid_by_country <- covid %>%
rename(cases = casesweekly, deaths = deathsweekly) %>%
filter(cases != 0, deaths != 0) %>%
group_by(country) %>%
mutate(tp = interval(Sys.Date(), daterep) / ddays(1),
tp = tp - min(tp)
)
Calculating number of cases and deaths per country. Keeping only informative rows.
lag_n <- 1
covid_cum <- covid_by_country %>%
mutate(cum_cases = with_order(tp, cumsum, cases),
cum_deaths = with_order(tp, cumsum, deaths),
risk = cum_deaths / cum_cases,
risk_lag = cum_deaths / lag(cum_cases, n = lag_n, order_by = tp)) %>%
ungroup()
cumlong <- covid_by_country %>%
group_by(daterep) %>%
summarise_at(c("cases", "deaths"), sum) %>%
mutate_at(c("cases", "deaths"), cumsum) %>%
pivot_longer(cases:deaths)
cumlong %>%
ggplot(aes(daterep, value, linetype = name)) +
geom_line() +
geom_dl(data = cumlong %>%
group_by(name) %>%
filter(name %in% c("cases", "deaths"), value == max(value)),
aes(label = prettyNum(value, big.mark = ",")),
method = list("last.points", hjust = 1.05, vjust = -0.3)) +
labs(x = "Date",
y = "Number of cases or deaths",
title = "Global cases and deaths") +
scale_y_continuous(limits = c(0, max(cumlong$value) * 1.1)) +
scale_linetype_discrete(labels = c("Cases", "Deaths")) +
theme(legend.title = element_blank(),
legend.position = "bottom")
COVID-19 cases worldwide by country.
top_10_cases <- covid_cum %>%
group_by(country) %>%
summarise_at("cum_cases", max) %>%
top_n(10, cum_cases) %>%
pull(country)
top_10_deaths <- covid_cum %>%
group_by(country) %>%
summarise_at("cum_deaths", max) %>%
top_n(10, cum_deaths) %>%
pull(country)
covid_cum %>%
ggplot(aes(daterep, cum_cases)) +
geom_line(aes(group = country, color = country %in% top_10_cases)) +
geom_dl(aes(label = geoid, color = country %in% top_10_cases), method = list("last.points", cex = 0.8)) +
scale_y_log10() +
scale_color_manual(values = c("gray", "black")) +
theme(legend.position = "none") +
labs(title = "Cases",
x = "Date",
y = "Cumulative number of cases",
caption = "Each line represents one country.\nTop 10 is shown in black.")
COVID-19 deaths worldwide.
covid_cum %>%
ggplot(aes(daterep, cum_deaths)) +
geom_line(aes(group = country, color = country %in% top_10_deaths)) +
geom_dl(aes(label = geoid, color = country %in% top_10_deaths), method = list("last.points", cex = 0.8)) +
scale_y_log10() +
scale_color_manual(values = c("gray", "black")) +
theme(legend.position = "none") +
labs(title = "Deaths",
x = "Date",
y = "Cumulative number of deaths",
caption = "Each line represents one country.\nTop 10 is shown in black.")
Number of cases per country.
covid_cum %>%
ggplot(aes(tp, cum_cases)) +
geom_line(aes(group = country, color = country %in% top_10_cases)) +
geom_dl(aes(label = geoid, color = country %in% top_10_cases), method = list("last.points", cex = 0.8)) +
scale_y_log10() +
scale_color_manual(values = c("gray", "black")) +
theme(legend.position = "none") +
labs(x = "Days since first case in each country",
y = "Cumulative number of cases",
caption = "Each line represents one country.\nTop 10 is shown in black.")
Number of deaths per country.
covid_cum %>%
ggplot(aes(tp, cum_deaths)) +
geom_line(aes(group = country, color = country %in% top_10_deaths)) +
geom_dl(aes(label = geoid, color = country %in% top_10_deaths), method = list("last.points", cex = 0.8)) +
scale_y_log10() +
scale_color_manual(values = c("gray", "black")) +
theme(legend.position = "none") +
labs(x = "Days since first death in each country",
y = "Cumulative number of deaths",
caption = "Each line represents one country.\nTop 10 is shown in black.")
covid_cum %>%
ggplot(aes(tp, risk)) +
geom_line(aes(group = country)) +
geom_dl(aes(label = geoid), method = list("last.points", cex = 0.8)) +
scale_y_log10() +
labs(x = "Time, days from first case",
y = "Risk of death",
caption = "Each line represents one country")
Lagged (7 days) risk. Risk of death relative to number of cases 7 days earlier.
covid_cum %>%
ggplot(aes(tp, risk_lag)) +
geom_line(aes(group = country), size = 0.2) +
geom_dl(aes(label = geoid), method = list("last.points", cex = 0.8)) +
scale_y_log10() +
labs(x = "Time, days from first case",
y = "Risk of death",
caption = "Each line represents one country")
Relative number of cases and deaths per 100,000 population
rolling_sum <- rollify(sum, window = 2)
Fill in missing dates to calculate 14-day rolling sum.
rolling_sums <- covid_cum %>%
group_by(country) %>%
complete(daterep = seq.Date(min(daterep), max(daterep), "week"),
fill = list(cases = 0, deaths = 0),
geoid,
popdata) %>%
mutate(nobs = n()) %>%
filter(nobs > 14) %>%
mutate(cases14 = rolling_sum(cases),
deaths14 = rolling_sum(deaths),
cases14_100k = (cases14 / popdata) * 1e5,
deaths14_100k = (deaths14 / popdata) * 1e5) %>%
select(country, daterep, geoid, popdata, ends_with("14"), ends_with("14_100k")) %>%
na.omit()
europe <- rolling_sums %>%
filter(gsub("_", " ", country) %in% c(eu$country, "Norway", "Russia"))
ranks <- europe %>%
group_by(country) %>%
mutate(
rank_cases = cases14_100k[daterep == max(daterep)],
rank_deaths = deaths14_100k[daterep == max(daterep)]) %>%
ungroup()
ranks %>%
mutate(country = reorder(country, 1 / rank_cases)) %>%
ggplot(aes(daterep, cases14_100k)) +
geom_line(aes(group = country)) +
facet_wrap(~ country, scales = "free_y") +
labs(x = "Date",
y = "14-day rolling cases\nper 100,000 population")
ranks %>%
mutate(country = reorder(country, 1 / rank_deaths)) %>%
ggplot(aes(daterep, deaths14_100k)) +
geom_line(aes(group = country)) +
facet_wrap(~ country, scales = "free_y") +
labs(x = "Date",
y = "14-day rolling deaths\nper 100,000 population")
rolling_sums %>%
filter(gsub("_", " ", country) %in% c(eu$country, "Norway", "Russia")) %>%
mutate(risk = deaths14 / lag(cases14, 1),
country = reorder(country, 1 / risk)) %>%
group_by(country) %>%
mutate(
rank_risk = risk[daterep == max(daterep)]) %>%
ungroup() %>%
mutate(country = reorder(country, 1 / rank_risk)) %>%
ggplot(aes(daterep, risk)) +
geom_line(aes(group = country)) +
facet_wrap(~ country) +
scale_y_log10() +
labs(x = "Date",
y = "Risk of death")
est_raw <- read_csv(here("data/opendata_covid19_test_results.csv"))
est <- est_raw %>%
mutate(result_wk = isoweek(ResultTime),
ResultDate = date(ResultTime)) %>%
filter(yday(ResultDate) < yday(today()) - 1)
- aasta rahvaarv Statistikaameti andmebaasist.
rahvaarv <- 1328976
14 day rolling number of cases.
rolling_sum <- rollify(sum, window = 14)
est_cov_sum <- est %>%
count(ResultDate, ResultValue) %>%
group_by(ResultValue) %>%
complete(ResultDate = seq.Date(min(ResultDate), max(ResultDate), "day"), fill = list(n = 0)) %>%
group_by(ResultValue) %>%
mutate(n14 = rolling_sum(n)) %>%
drop_na()
est_cov_sum %>%
ggplot() +
geom_line(aes(ResultDate, n14)) +
facet_wrap(~ ResultValue, scales = "free_y") +
labs(x = "Date, 2020",
y = "Number of tests")
est_cov_sum_100k <- est_cov_sum %>%
filter(ResultValue == "P") %>%
mutate(n14_100k = (n14 / rahvaarv) * 100000)
est_cov_sum_100k %>%
ggplot(aes(ResultDate, n14_100k)) +
geom_line() +
geom_dl(label = prettyNum(est_cov_sum_100k$n14_100k[length(est_cov_sum_100k$n14_100k)], digits = 3), method = list("last.points", cex = 0.8)) +
labs(x = "Date, 2020",
y = "14 day cases per 100'000 population")
Percent of positive cases per week.
est %>%
count(result_wk, ResultValue) %>%
pivot_wider(names_from = ResultValue, values_from = n) %>%
mutate(tests = N + P,
pp = P / tests) %>%
na.omit() %>%
ggplot() +
geom_point(aes(result_wk, pp, size = tests)) +
scale_y_continuous(labels = scales::percent) +
labs(x = "Week of 2020",
y = "Positive tests, %")
We are looking at the test result reporting and publishing timestamps by Estonian Healthboard.
When are the analyses performed and reported during the day. Be extra careful with interpretations!
daytime <- function(x) {
s <- as.character(second(x))
if (nchar(s) == 1) {
s <- paste0(s, s)
}
paste0(hour(x), ":", minute(x), ":", s) %>%
hms::parse_hms()
}
processing <- est %>%
mutate(result_to_insert = interval(ResultTime, AnalysisInsertTime) / dhours(1),
result_time = daytime(ResultTime),
insert_time = daytime(AnalysisInsertTime)) %>%
filter(result_to_insert > 0) %>%
group_by(date(ResultTime)) %>%
summarise_at("result_to_insert", median)
Results timestamps during day.
processing %>%
filter(`date(ResultTime)` > "2020-03-15") %>%
ggplot(aes(`date(ResultTime)`, result_to_insert)) +
geom_line() +
scale_y_log10() +
labs(x = "Result week, 2020", y = "Result to db insert, h")
Timestamps of result insertion to database.
est %>%
mutate(insert_time = daytime(AnalysisInsertTime)) %>%
ggplot() +
geom_histogram(aes(x = insert_time, y = ..count.. / sum(..count..)), bins = 24) +
scale_y_continuous(labels = scales::percent) +
labs(x = "Result database insertion time", y = "Percent cases")
Time from test result to database insertion.
processing %>%
ggplot() +
geom_histogram(aes(x = result_to_insert, y = ..count.. / sum(..count..)), bins = 24) +
scale_y_continuous(labels = scales::percent) +
scale_x_log10(labels = formatC) +
labs(x = "Timespan from test result to database insertion, hours",
y = "Percent cases")