77import pandas as pd
88import requests
99
10+
def get_sources():
    """Download the COVID-19 Data Hub source catalogue.

    Fetches ``src.csv`` from the Data Hub storage and parses the response
    body as CSV.

    Returns:
        pandas.DataFrame: one row per catalogued data source.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: no response arrived within the timeout.
    """
    url = 'https://storage.covid19datahub.io/src.csv'
    # Without a timeout requests may wait forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page as CSV.
    response.raise_for_status()
    return pd.read_csv(StringIO(response.text))
1415
16+
1517def sources_to_citations (sources ):
1618 # shorten URL
1719 sources .url = sources .url .apply (
1820 lambda u : re .sub (
1921 r"(http://|https://|www\\.)([^/]+)(.*)" ,
2022 r"\1\2/" ,
21- u )
23+ u )
2224 )
2325 # remove duplicit
24- unique_references = sources .groupby (["title" ,"author" ,"institution" ,"url" ,"textVersion" ,"bibtype" ])
25-
26+ unique_references = sources .groupby (["title" ,"author" ,"institution" ,"url" ,"textVersion" ,"bibtype" ])
27+
2628 # format
2729 citations = []
28- for n ,g in unique_references :
30+ for n , g in unique_references :
2931 for i in range (1 ):
30- (title ,author ,institution ,url ,textVersion ,bibtype ) = n
32+ (title , author , institution , url , textVersion , bibtype ) = n
3133 year = g .year .max ()
3234
3335 if textVersion :
@@ -55,53 +57,57 @@ def sources_to_citations(sources):
5557 else :
5658 post += "."
5759 citation = f"{ pre } ({ year } ), { post } "
58-
60+
5961 citations .append (citation )
6062 return citations
6163
def cite(x: pd.DataFrame, verbose: bool = True, sources: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Select the source records that back the non-empty columns of ``x``.

    For every ``iso_alpha_3`` group in ``x`` the function determines which
    columns carry data (not entirely null and not entirely zero), picks the
    rows of ``sources`` matching that ISO code, the group's first
    ``administrative_area_level`` and those columns, and falls back to the
    generic rows (null ISO and null level) for columns without a specific
    source. Fallback rows are stamped with the group's ISO code and level.

    Args:
        x: data with at least the ``iso_alpha_3`` and
            ``administrative_area_level`` columns.
        verbose: currently unused; kept so existing callers keep working.
        sources: source catalogue with ``iso_alpha_3``,
            ``administrative_area_level`` and ``data_type`` columns.
            Downloaded with ``get_sources()`` when omitted.

    Returns:
        A de-duplicated DataFrame with the columns of ``sources``; empty
        when ``x`` has no rows. The ``sources`` argument is not modified.
    """
    # all sources if missing
    if sources is None:
        sources = get_sources()

    per_country = []
    # Group by the column name itself (not a one-element list) so the group
    # key is a plain scalar on every pandas version.
    for iso, country in x.groupby("iso_alpha_3"):
        # only the first level present in the group is considered
        level = country.administrative_area_level.unique()[0]

        # columns that are entirely null or entirely zero carry no data
        empty_params = country.apply(lambda c: c.isnull().all() | (c == 0).all())
        params = x.columns[~empty_params.to_numpy()]

        # sources declared for exactly this country and level
        src = sources[
            (sources.administrative_area_level == level)
            & (sources.iso_alpha_3 == iso)
            & sources.data_type.isin(params)
        ]

        # generic sources for the data types not covered above
        missing = set(params) - set(src.data_type.unique())
        if missing:
            fallback = sources[
                sources.data_type.isin(missing)
                & sources.iso_alpha_3.isnull()
                & sources.administrative_area_level.isnull()
            ]
            src = pd.concat([src, fallback])

        # assign() returns a new frame, so neither `sources` nor a filtered
        # view of it is written to.
        per_country.append(
            src.assign(iso_alpha_3=iso, administrative_area_level=level)
        )

    if not per_country:
        return pd.DataFrame(data=None, columns=sources.columns)

    # a single concat avoids re-copying the accumulated rows per country
    return pd.concat(per_country).drop_duplicates()
146-
147- __all__ = ["cite" ,"get_sources" ]
152+
153+
154+ __all__ = ["cite" , "get_sources" ]
0 commit comments