Visualizing corona case data with altair#
Norway has stopped tracking and reporting data about the ongoing covid pandemic, but we can view data from 2020-2022.
import io
from functools import lru_cache
import altair as alt
import matplotlib.pyplot as plt
import pandas as pd
import requests
data_url = "https://raw.githubusercontent.com/thohan88/covid19-nor-data/HEAD/data"
@lru_cache()
def download_dataset(path):
"""Download a dataset from covid19-nor-data archive"""
url = f"{data_url}/{path}"
print(f"Downloading {url}")
r = requests.get(f"{data_url}/{path}")
r.raise_for_status()
print(f"Downloaded {len(r.content) // 1024}kB")
if path.endswith(".csv"):
read = pd.read_csv
elif path.endswith(".xlsx"):
read = pd.read_excel
return read(io.BytesIO(r.content), parse_dates=["date"])
all_cases = download_dataset("01_infected/msis/municipality_and_district.csv")
all_cases
Downloading https://raw.githubusercontent.com/thohan88/covid19-nor-data/HEAD/data/01_infected/msis/municipality_and_district.csv
Downloaded 31065kB
date | date_time | kommune_bydel_no | kommune_bydel_name | bydel_no | bydel_name | kommune_no | kommune_name | fylke_no | fylke_name | population | cases | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-03-26 | 2020-03-26T04:00:00Z | 30101 | Gamle Oslo | 30101.0 | Gamle Oslo | 301 | Oslo | 3 | Oslo | 58671 | 67 |
1 | 2020-03-26 | 2020-03-26T04:00:00Z | 30102 | Grünerløkka | 30102.0 | Grünerløkka | 301 | Oslo | 3 | Oslo | 62423 | 59 |
2 | 2020-03-26 | 2020-03-26T04:00:00Z | 30103 | Sagene | 30103.0 | Sagene | 301 | Oslo | 3 | Oslo | 45089 | 47 |
3 | 2020-03-26 | 2020-03-26T04:00:00Z | 30104 | St. Hanshaugen | 30104.0 | St. Hanshaugen | 301 | Oslo | 3 | Oslo | 38945 | 33 |
4 | 2020-03-26 | 2020-03-26T04:00:00Z | 30105 | Frogner | 30105.0 | Frogner | 301 | Oslo | 3 | Oslo | 59269 | 67 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
377250 | 2022-12-07 | 2022-12-07T04:00:00Z | 5442 | Nesseby | NaN | NaN | 5442 | Nesseby | 54 | Troms og Finnmark | 926 | 88 |
377251 | 2022-12-07 | 2022-12-07T04:00:00Z | 5443 | Båtsfjord | NaN | NaN | 5443 | Båtsfjord | 54 | Troms og Finnmark | 2221 | 447 |
377252 | 2022-12-07 | 2022-12-07T04:00:00Z | 5444 | Sør-Varanger | NaN | NaN | 5444 | Sør-Varanger | 54 | Troms og Finnmark | 10158 | 1601 |
377253 | 2022-12-07 | 2022-12-07T04:00:00Z | 9998 | Svalbard | NaN | NaN | 9998 | Svalbard | 99 | Svalbard | 0 | 0 |
377254 | 2022-12-07 | 2022-12-07T04:00:00Z | 9999 | Ukjent Kommune | NaN | NaN | 9999 | Ukjent Kommune | 99 | Ukjent Fylke | 0 | 19722 |
377255 rows × 12 columns
# aggregate data by fylke
cases = (
all_cases.groupby(["fylke_name", "date"])[["cases", "population"]]
.sum()
.reset_index()
).copy()
cases
fylke_name | date | cases | population | |
---|---|---|---|---|
0 | Agder | 2020-03-26 | 133 | 307231 |
1 | Agder | 2020-03-27 | 148 | 307231 |
2 | Agder | 2020-03-28 | 159 | 307231 |
3 | Agder | 2020-03-29 | 161 | 307231 |
4 | Agder | 2020-03-30 | 166 | 307231 |
... | ... | ... | ... | ... |
12800 | Viken | 2022-12-03 | 393395 | 1241165 |
12801 | Viken | 2022-12-04 | 393399 | 1241165 |
12802 | Viken | 2022-12-05 | 393410 | 1241165 |
12803 | Viken | 2022-12-06 | 393463 | 1241165 |
12804 | Viken | 2022-12-07 | 393518 | 1241165 |
12805 rows × 4 columns
# discard ukjent fylke where population 100k doesn't make sense
cases = cases[~cases.fylke_name.str.contains("Ukjent")].copy()
cases
fylke_name | date | cases | population | |
---|---|---|---|---|
0 | Agder | 2020-03-26 | 133 | 307231 |
1 | Agder | 2020-03-27 | 148 | 307231 |
2 | Agder | 2020-03-28 | 159 | 307231 |
3 | Agder | 2020-03-29 | 161 | 307231 |
4 | Agder | 2020-03-30 | 166 | 307231 |
... | ... | ... | ... | ... |
12800 | Viken | 2022-12-03 | 393395 | 1241165 |
12801 | Viken | 2022-12-04 | 393399 | 1241165 |
12802 | Viken | 2022-12-05 | 393410 | 1241165 |
12803 | Viken | 2022-12-06 | 393463 | 1241165 |
12804 | Viken | 2022-12-07 | 393518 | 1241165 |
11820 rows × 4 columns
# 'cases' is a cumulative sum
# reverse that to calculate the daily new case count
# 1. createe a new column
cases["daily cases"] = 0
# 2. populate it per fylke
for fylke in cases.fylke_name.unique():
mask = cases.fylke_name == fylke
fylke_cases = cases.loc[mask]
diff = fylke_cases.cases.diff()
# set first value from cases
diff.iloc[0] = fylke_cases.iloc[0].cases
cases.loc[fylke_cases.index, "daily cases"] = diff.astype(int)
cases
fylke_name | date | cases | population | daily cases | |
---|---|---|---|---|---|
0 | Agder | 2020-03-26 | 133 | 307231 | 133 |
1 | Agder | 2020-03-27 | 148 | 307231 | 15 |
2 | Agder | 2020-03-28 | 159 | 307231 | 11 |
3 | Agder | 2020-03-29 | 161 | 307231 | 2 |
4 | Agder | 2020-03-30 | 166 | 307231 | 5 |
... | ... | ... | ... | ... | ... |
12800 | Viken | 2022-12-03 | 393395 | 1241165 | 47 |
12801 | Viken | 2022-12-04 | 393399 | 1241165 | 4 |
12802 | Viken | 2022-12-05 | 393410 | 1241165 | 11 |
12803 | Viken | 2022-12-06 | 393463 | 1241165 | 53 |
12804 | Viken | 2022-12-07 | 393518 | 1241165 | 55 |
11820 rows × 5 columns
Add per-100,000 population column#
# per100k is "daily new cases per 100k population"
cases["per100k"] = cases["daily cases"] * 1e5 / (cases["population"] + 1)
cases
fylke_name | date | cases | population | daily cases | per100k | |
---|---|---|---|---|---|---|
0 | Agder | 2020-03-26 | 133 | 307231 | 133 | 43.289761 |
1 | Agder | 2020-03-27 | 148 | 307231 | 15 | 4.882304 |
2 | Agder | 2020-03-28 | 159 | 307231 | 11 | 3.580356 |
3 | Agder | 2020-03-29 | 161 | 307231 | 2 | 0.650974 |
4 | Agder | 2020-03-30 | 166 | 307231 | 5 | 1.627435 |
... | ... | ... | ... | ... | ... | ... |
12800 | Viken | 2022-12-03 | 393395 | 1241165 | 47 | 3.786762 |
12801 | Viken | 2022-12-04 | 393399 | 1241165 | 4 | 0.322278 |
12802 | Viken | 2022-12-05 | 393410 | 1241165 | 11 | 0.886263 |
12803 | Viken | 2022-12-06 | 393463 | 1241165 | 53 | 4.270178 |
12804 | Viken | 2022-12-07 | 393518 | 1241165 | 55 | 4.431317 |
11820 rows × 6 columns
Exercise#
plot cases per 1000 by fylke over time
latest_cases = cases[-5000:]
latest_cases.columns
Index(['fylke_name', 'date', 'cases', 'population', 'daily cases', 'per100k'], dtype='object')
subset = cases[cases.fylke_name.isin(["Oslo", "Viken"])]
daily = (
alt.Chart(subset)
.mark_point()
.encode(
x="date:T",
y="per100k",
color="fylke_name",
tooltip=[
"fylke_name",
"population",
"per100k",
"cases",
],
)
)
daily
weekly = (
alt.Chart(subset)
.mark_line()
.transform_window(
weekly_avg="mean(per100k)",
frame=[-3, 3],
)
.encode(
x="date:T",
y="weekly_avg:Q",
color="fylke_name",
tooltip=[
"fylke_name",
"population",
"per100k",
"cases",
"date",
],
)
)
weekly
weekly | daily
We can also do the same with matplotlib
subset.groupby("fylke_name")[["date", "per100k"]].plot(
x="date", y="per100k", legend=True
)
fylke_name
Oslo Axes(0.125,0.2;0.775x0.68)
Viken Axes(0.125,0.2;0.775x0.68)
dtype: object
subset.set_index("date").groupby("fylke_name").per100k.plot(legend=True)
fylke_name
Oslo Axes(0.125,0.2;0.775x0.68)
Viken Axes(0.125,0.2;0.775x0.68)
Name: per100k, dtype: object
admissions = pd.read_csv(
"pasienter-innlagt-i-syke.csv",
parse_dates=["Dato"],
dayfirst=True,
)
admissions
Dato | Kumulativt antall | Nye sykehusinnlegelser | |
---|---|---|---|
0 | 2020-02-21 | 0 | 0 |
1 | 2020-02-22 | 0 | 0 |
2 | 2020-02-23 | 0 | 0 |
3 | 2020-02-24 | 0 | 0 |
4 | 2020-02-25 | 0 | 0 |
... | ... | ... | ... |
973 | 2022-10-21 | 16348 | 13 |
974 | 2022-10-22 | 16359 | 11 |
975 | 2022-10-23 | 16374 | 15 |
976 | 2022-10-24 | 16389 | 15 |
977 | 2022-10-25 | 16390 | 1 |
978 rows × 3 columns
alt.Chart(admissions).mark_line().encode(
x="Dato",
y="Nye sykehusinnlegelser:Q",
)
monthly_hospital = (
alt.Chart(admissions)
.mark_line()
.transform_window(
frame=[-30, 0],
admissions="mean(Nye sykehusinnlegelser)",
)
.encode(
x="Dato",
y="admissions:Q",
)
)
alt.vconcat(monthly_hospital, weekly)
weekly