Skip to content

Commit 697d61b

Browse files
author
Martin Benes
committed
Fixed deprecated pandas calls; fixed handling of newly added columns
1 parent 421ae5e commit 697d61b

File tree

8 files changed

+218
-162
lines changed

8 files changed

+218
-162
lines changed

covid19dh/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"""Unified data hub for a better understanding of COVID-19.
33
44
For more information check README.md.
5-
5+
66
Reference: https://covid19datahub.io/
77
Todo:
88
* caching
@@ -13,5 +13,5 @@
1313

1414
try:
1515
__version__ = pkg_resources.get_distribution("covid19dh").version
16-
except:
16+
except Exception:
1717
__version__ = None

covid19dh/_cache.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11

22
# ======== data cache =========
3-
_cache = {} # data
3+
_cache = {} # data
4+
45

56
def _construct_cache_id(level, dt, raw, vintage):
67
cache_id = f"{level}"
@@ -10,32 +11,41 @@ def _construct_cache_id(level, dt, raw, vintage):
1011
cache_id += dt.strftime("%Y-%m-%d")
1112
return cache_id
1213

14+
1315
def read_cache(level, dt, raw, vintage):
1416
cache_id = _construct_cache_id(level=level, dt=dt, raw=raw, vintage=vintage)
1517
try:
1618
return _cache[cache_id]
17-
except:
19+
except Exception:
1820
return None
21+
1922
def write_cache(x, level, dt, raw, vintage):
2023
cache_id = _construct_cache_id(level=level, dt=dt, raw=raw, vintage=vintage)
2124
_cache[cache_id] = x
2225

26+
2327
# ========= src cache ==========
24-
_cache_src = {} # src
28+
_cache_src = {} # src
29+
30+
2531
def _construct_src_cache_id(dt, vintage):
2632
cache_id = "src"
2733
if vintage:
2834
cache_id += dt.strftime("%Y-%m-%d")
2935
return cache_id
30-
36+
37+
3138
def read_src_cache(dt, vintage):
3239
cache_id = _construct_src_cache_id(dt=dt, vintage=vintage)
3340
try:
3441
return _cache_src[cache_id]
35-
except:
42+
except Exception:
3643
return None
44+
45+
3746
def write_src_cache(src, dt, vintage):
3847
cache_id = _construct_src_cache_id(dt=dt, vintage=vintage)
3948
_cache_src[cache_id] = src
40-
41-
__all__ = ["read_cache", "write_cache", "read_src_cache", "write_src_cache"]
49+
50+
51+
__all__ = ["read_cache", "write_cache", "read_src_cache", "write_src_cache"]

covid19dh/_cite.py

Lines changed: 42 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7,27 +7,29 @@
77
import pandas as pd
88
import requests
99

10+
1011
def get_sources():
1112
url = 'https://storage.covid19datahub.io/src.csv'
1213
response = requests.get(url) # headers={'User-Agent': 'Mozilla/5.0'}
1314
return pd.read_csv( StringIO(response.text))
1415

16+
1517
def sources_to_citations(sources):
1618
# shorten URL
1719
sources.url = sources.url.apply(
1820
lambda u: re.sub(
1921
r"(http://|https://|www\\.)([^/]+)(.*)",
2022
r"\1\2/",
21-
u )
23+
u)
2224
)
2325
# remove duplicit
24-
unique_references = sources.groupby(["title","author","institution","url","textVersion","bibtype"])
25-
26+
unique_references = sources.groupby(["title","author","institution","url","textVersion","bibtype"])
27+
2628
# format
2729
citations = []
28-
for n,g in unique_references:
30+
for n, g in unique_references:
2931
for i in range(1):
30-
(title,author,institution,url,textVersion,bibtype) = n
32+
(title, author, institution, url, textVersion, bibtype) = n
3133
year = g.year.max()
3234

3335
if textVersion:
@@ -55,53 +57,57 @@ def sources_to_citations(sources):
5557
else:
5658
post += "."
5759
citation = f"{pre} ({year}), {post}"
58-
60+
5961
citations.append(citation)
6062
return citations
6163

62-
def cite(x, verbose = True, sources = None):
64+
65+
def cite(x: pd.DataFrame, verbose: bool = True, sources: bool = None):
6366
# all sources if missing
6467
if sources is None:
6568
sources = get_sources()
66-
69+
6770
# per iso
6871
references = pd.DataFrame(data=None, columns=sources.columns)
69-
for iso,country in x.groupby(["iso_alpha_3"]):
72+
for (iso,), country in x.groupby(["iso_alpha_3"]):
7073
# levels
7174
level = country.administrative_area_level.unique()[0]
7275
# empty attributes
7376
empty_params = country.apply(lambda c: c.isnull().all() | (c == 0).all())
7477
params = x.columns[~empty_params]
75-
78+
7679
# filter
7780
src = sources[
78-
(sources.administrative_area_level == level) & # level
81+
(sources.administrative_area_level == level) & # level
7982
(sources.iso_alpha_3 == iso) & # iso
8083
sources.data_type.isin(params) # data type
8184
]
8285
# fallback for missing
8386
missing = set(params) - set(src.data_type.unique())
8487
if missing:
85-
src = src.append(sources[
86-
sources.data_type.isin(missing) & # data type
87-
sources.iso_alpha_3.isnull() & # empty ISO
88-
sources.administrative_area_level.isnull() # empty level
88+
src = pd.concat([
89+
src,
90+
sources[
91+
sources.data_type.isin(missing) & # data type
92+
sources.iso_alpha_3.isnull() & # empty ISO
93+
sources.administrative_area_level.isnull() # empty level
94+
]
8995
])
90-
96+
9197
# set iso,level
9298
src.iso_alpha_3 = iso
9399
src.administrative_area_level = level
94-
100+
95101
# join
96-
references = references.append(src)
97-
98-
references.drop_duplicates(inplace = True)
99-
102+
references = pd.concat([references, src])
103+
104+
references.drop_duplicates(inplace=True)
105+
100106
return references
101-
102-
103-
104-
107+
108+
109+
110+
105111
# ===
106112
# hash data stats
107113
params = set(x.columns)
@@ -112,36 +118,37 @@ def cite(x, verbose = True, sources = None):
112118
sources["iso_alpha_3"].isin(isos) &
113119
sources["data_type"].isin(params) ]
114120
sources = sources.fillna("")
115-
121+
116122
# filter
117123
def is_source_used(ref):
118124
# data type not present
119125
if not ref['data_type'] in params: return False
120126
# fallbacks
121127
if not ref['iso_alpha_3'] or not ref['administrative_area_level']: return True
122-
128+
123129
# check both equal
124130
return ((x.iso_alpha_3 == ref.iso_alpha_3) & (x.administrative_area_level == ref.administrative_area_level)).any()
125-
131+
126132
sources = sources[sources.apply(is_source_used, axis=1)]
127-
133+
128134
# drop fallback
129135
for p in params:
130136
non_fallback = (sources.data_type == p) & (sources.iso_alpha_3 != '')
131137
no_data = (x[p].isnull() | (x[p] == 0))
132138
fallback = (sources.data_type == p) & (sources.iso_alpha_3 == '')
133139
if non_fallback.any() or no_data.all():
134140
sources.drop(fallback.index[fallback].tolist(), inplace=True)
135-
141+
136142
#citations = sources_to_citations(sources)
137-
143+
138144
#if verbose:
139145
# print("\033[1mData References:\033[0m\n", end="")
140146
# for ref in citations:
141147
# print("\t" + ref, end="\n\n")
142148
# print("\033[33mTo hide the data sources use 'verbose = False'.\033[0m")
143-
144-
sources.replace(r'^\s*$', math.nan, regex=True, inplace=True)
149+
150+
sources.replace(r'^\s*$', math.nan, regex=True, inplace=True)
145151
return sources
146-
147-
__all__ = ["cite","get_sources"]
152+
153+
154+
__all__ = ["cite", "get_sources"]

0 commit comments

Comments
 (0)