-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathchart_average_reporting_delay
executable file
·119 lines (111 loc) · 5.27 KB
/
chart_average_reporting_delay
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/python3
import sys, datetime, os, math
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import scipy.stats as stats
from scipy.optimize import curve_fit
# Module-level store filled by record(): maps each date of death to a list of
# (as_of, deaths) tuples, one tuple per observation that was recorded for it.
data = {}
def parse_date(s):
    """Return the calendar date encoded in the first eight characters of *s*.

    The leading characters are read as ``YYYYMMDD``; anything that follows
    them (for instance a ``.csv`` suffix) is ignored.

    Raises:
        ValueError: if the first eight characters are not a valid date.
    """
    stamp = s[:8]
    parsed = datetime.datetime.strptime(stamp, '%Y%m%d')
    return parsed.date()
def record(as_of, date_of_death, deaths):
    """Store one observation in the module-level ``data`` dict.

    Appends the pair ``(as_of, deaths)`` to the list kept under
    ``date_of_death``, creating that list the first time the key is seen.
    """
    data.setdefault(date_of_death, []).append((as_of, deaths))
def dump():
    """Print the contents of ``data``, one line per date of death.

    Dates of death are printed in ascending order. Each line starts with the
    date of death and continues with one `` as_of:deaths`` entry per stored
    observation, in the order the observations are held in the list.
    """
    for day in sorted(data):
        observations = ''.join(
            f' {as_of}:{deaths}' for as_of, deaths in data[day]
        )
        print(f'{day}{observations}')
def chart():
    """Plot the reporting-delay curves with an exponential fit and save a PNG.

    Reads the module-level ``data`` dict (date of death -> list of
    ``(as_of, deaths)`` tuples) and:

    * draws, for every date of death, the percentage of the final death count
      that had been reported N days after that date (the last tuple of each
      list is taken as the final count);
    * fits ``1 - exp(-a * x)`` to all points that are at or below 100%;
    * writes the fitted rate ``a`` to ``curve_fit.txt``;
    * annotates how many days the fitted curve needs to reach 50%, 85% and
      95% of reported deaths;
    * saves the figure to ``chart_average_reporting_delay.png``.

    Every list in ``data`` must end with a non-zero death count because that
    value is used as a divisor.
    """
    (fig, ax) = plt.subplots(dpi=300, figsize=(10, 5))
    right = 60  # upper bound of the x axis, in days
    xy_fit = []
    for date_of_death in sorted(data.keys()):
        series = data[date_of_death]
        # the last observation is treated as the final (100%) death count
        final_deaths = series[-1][1]
        x_days_past_date = [(date - date_of_death).days for (date, _) in series]
        y_deaths = [100 * deaths / final_deaths for (_, deaths) in series]
        # save points for curve fitting, ignoring points above 100%
        xy_fit.extend(
            ((date - date_of_death).days, deaths / final_deaths)
            for (date, deaths) in series
            if deaths / final_deaths <= 1
        )
        ax.plot(x_days_past_date, y_deaths, color='black', linewidth=0.3, alpha=0.5)

    # fit into exponential function
    def func_exp(x, a):
        return 1 - np.exp(-a * x)

    x_fit = [x for (x, _) in xy_fit]
    y_fit = [y for (_, y) in xy_fit]
    popt, pcov = curve_fit(func_exp, x_fit, y_fit)
    # use a context manager so the file is flushed and closed deterministically
    with open('curve_fit.txt', 'w') as out:
        out.write(f'{popt[0]:.4f}\n')
    x2 = np.linspace(0, right)
    ax.plot(x2, 100 * func_exp(x2, *popt), color='red', alpha=0.5, linewidth=3,
            label=f'Exponential distribution CDF = $1-e^{{-{popt[0]:.4f}x}}$')

    # calculates nr. of days to reach given fraction of reported deaths
    # (inverse of func_exp: x = ln(1 - frac) / -a)
    for frac in (.5, .85, .95):
        days = np.log(1 - frac) / -popt[0]
        ax.annotate(f'{round(frac * 100)}% of deaths reported within {days:.1f} days',
                    xy=(days, 100 * frac), xycoords='data', textcoords='offset points', xytext=(50,-50),
                    arrowprops=dict(facecolor='black', width=1, headwidth=5, headlength=5), ha='left')

    # configure chart
    ax.grid(True, which='minor', linewidth=0.2, color='pink')
    ax.grid(True, which='major', linewidth=0.5, color='pink')
    ax.spines['top'].set_visible(False)
    ax.spines['bottom'].set_visible(True)
    ax.spines['left'].set_visible(True)
    ax.spines['right'].set_visible(False)
    ax.set_ylabel('Percentage of deaths reported')
    ax.set_xlabel('Days after date of death')
    ax.xaxis.set_minor_locator(ticker.MultipleLocator(base=1))
    ax.xaxis.set_major_locator(ticker.MultipleLocator(base=5))
    ax.yaxis.set_minor_locator(ticker.MultipleLocator(base=5))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=10))
    ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%g%%'))
    ax.set_xlim(left=0, right=right)
    ax.set_ylim(bottom=0)
    ax.legend(loc='lower right')
    ax.text(
        -0.05, -0.14,
        'Each curve represents progress by the Florida Department of Health to report all deaths that occur on a given day (Florida_COVID_19_Deaths_by_Day)\n'
        'Source: https://github.com/mbevand/florida-covid19-deaths-by-day Created by: Marc Bevand — @zorinaq',
        transform=ax.transAxes, fontsize='x-small', verticalalignment='top',
    )
    fig.suptitle(f'Florida COVID-19 deaths by day\n(N={len(data)} dates of death)')
    fig.savefig('chart_average_reporting_delay.png', bbox_inches='tight')
    # close this specific figure rather than relying on the implicit current one
    plt.close(fig)
def main():
    """Load every CSV export in the current directory, filter, and chart.

    For each ``*.csv`` file (processed in sorted filename order) the
    publication date is parsed from the part of the filename after the first
    underscore, and every row's ``Date`` / ``Deaths`` pair is stored through
    ``record()``. Dates of death before 2020-01-01 are skipped. Afterwards the
    per-date lists are sorted and the dates unsuitable for charting are
    removed from ``data`` before ``chart()`` is called.
    """
    earliest_date_of_death = datetime.date(2020, 1, 1)
    for f in sorted(os.listdir('.')):
        if not f.endswith('.csv'):
            continue
        df = pd.read_csv(f)
        # Parse date out of "deathsbydateexport_20200623.csv"
        as_of = parse_date(f.split('_')[1])
        found_date_of_death_for_same_day = False
        for _, row in df.iterrows():
            # 'Date' is divided by 1000 and read as a UTC epoch timestamp
            # (i.e. it is handled as milliseconds); a timezone-aware call
            # replaces the deprecated datetime.utcfromtimestamp().
            date_of_death = datetime.datetime.fromtimestamp(
                row['Date'] / 1000, tz=datetime.timezone.utc
            ).date()
            if date_of_death < earliest_date_of_death:
                continue
            record(as_of, date_of_death, int(row['Deaths']))
            found_date_of_death_for_same_day |= (as_of == date_of_death)
        if not found_date_of_death_for_same_day:
            # if the CSV file has no deaths recorded for the date of death the CSV file was published,
            # it means it is zero
            record(as_of, as_of, 0)
    # iterate over a snapshot of the keys because entries are deleted in the loop
    for date_of_death in list(data.keys()):
        data[date_of_death].sort()
        series = data[date_of_death]
        first_deaths = series[0][1]
        final_deaths = series[-1][1]
        # Eliminate the dates of death that:
        # - have less than 15 data points as the progression of death reporting through time
        #   is not very meaningful (this typically excludes the last 14 days), or
        # - end at zero deaths, since no percentage of the final count can be computed
        #   (dividing by it would raise ZeroDivisionError), or
        # - starts at more than 95% complete (this typically excludes early days from March/April
        #   as the daily archives of deaths by day data don't go far back enough to see the
        #   progression of death reporting for these early days)
        if len(series) < 15:
            del data[date_of_death]
        elif final_deaths == 0 or first_deaths / final_deaths > 0.95:
            del data[date_of_death]
    # call dump() here to inspect the filtered data when debugging
    chart()
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()