diff --git a/Dockerfile b/Dockerfile index 7a7492c..08ce2dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,7 @@ ENV PYTHON_VERSION=${PYTHON_VERSION} # install required software and programmes for development environment RUN apt-get update RUN apt-get install -y apt-utils vim curl wget unzip tree htop adduser -RUN apt-get install -y imagemagick=8:7.1.1.43+dfsg1-1+deb13u5 libssl-dev=3.5.4-1~deb13u2 +RUN apt-get install -y imagemagick=8:7.1.1.43+dfsg1-1+deb13u5 libssl-dev=3.5.4-1~deb13u2 libpq-dev=17.8-0+deb13u1 linux-libc-dev=6.12.69-1 # set up home environment RUN adduser ${user} diff --git a/config/uv/RandomTelecomPayments.cmd b/config/uv/RandomTelecomPayments.cmd index 3137807..ba34c70 100644 --- a/config/uv/RandomTelecomPayments.cmd +++ b/config/uv/RandomTelecomPayments.cmd @@ -1,2 +1,2 @@ :: call powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" -call uv add -r requirements.txt --link-mode=copy \ No newline at end of file +call uv add -r ..\..\requirements.txt --link-mode=copy \ No newline at end of file diff --git a/generator/cons.py b/generator/cons.py index eefee25..0f25067 100644 --- a/generator/cons.py +++ b/generator/cons.py @@ -14,7 +14,7 @@ # set data files fpath_randomtelecomtransdata = os.path.join(subdir_data,'RandomTelecomPayments.csv') fpath_randomtelecomusersdata = os.path.join(subdir_data,'RandomTelecomUsers.parquet') -fpath_arch_randomtelecomdata = os.path.join(subdir_data, 'arch', 'RandomTelecomPayments.csv') +fpath_arch_randomtelecomdata = os.path.join(subdir_data, 'arch', 'RandomTelecomPaymentsV0.3.csv') fpath_temp_llama_first_names = os.path.join(subdir_data, 'temp', 'llama_first_names_{country}.csv') fpath_temp_llama_last_names = os.path.join(subdir_data, 'temp', 'llama_last_names_{country}.csv') fpath_temp_llama_email_domains = os.path.join(subdir_data, 'temp', 'llama_email_domains_{country}.csv') diff --git a/generator/main.py b/generator/main.py index 769812a..36dde7b 100644 --- a/generator/main.py +++ b/generator/main.py @@ -11,6 +11,7 @@ from utilities.input_error_handling import input_error_handling from utilities.multiprocess import multiprocess from app.gen_random_telecom_data import gen_random_telecom_data +import qa def main(input_params_dict: dict): """ @@ -56,10 +57,6 @@ def main(input_params_dict: dict): # order results by userid and transaction date ascending user_data = user_data.sort_values(by = 'uid').reset_index(drop = True) trans_data = trans_data.sort_values(by = 'transaction_date').reset_index(drop = True) - # end timer - t1 = time() - total_runtime_seconds = round(t1 - t0, 2) - logging.info(f'Total Runtime: {total_runtime_seconds} seconds') # print out head and shape of data logging.info(f'RandomTeleComUsersData.shape: {user_data.shape}') logging.info(f'RandomTeleComTransData.shape: {trans_data.shape}') @@ -73,6 +70,15 @@ def main(input_params_dict: dict): logging.info(f'Writing output trans level random telecoms data to: {cons.fpath_randomtelecomtransdata}') user_data.to_parquet(cons.fpath_randomtelecomusersdata, engine='fastparquet') trans_data.to_csv(cons.fpath_randomtelecomtransdata, index = False) + # initialise and run QA checks on the transaction level data + qa.Uids(data=trans_data).run_all() + qa.Transactions(data=trans_data).run_all() + qa.Cards(data=trans_data).run_all() + qa.Ips(data=trans_data).run_all() + # end timer + t1 = time() + total_runtime_seconds = round(t1 - t0, 2) + logging.info(f'Total Runtime: {total_runtime_seconds} seconds') # return dataframes as dictionary return {"user_data": user_data, "trans_data": 
trans_data} diff --git a/generator/qa/Cards.py b/generator/qa/Cards.py new file mode 100644 index 0000000..5a58cdd --- /dev/null +++ b/generator/qa/Cards.py @@ -0,0 +1,51 @@ +import logging +import seaborn as sns + +class Cards(): + + def __init__(self, data, show_logs=False, show_plots=False): + """ + """ + logging.info("Initialising Cards QA") + self.data = data + self.show_logs = show_logs + self.show_plots = show_plots + + def unique_card_types(self): + """ + """ + nunique_card_types_per_card = self.data.groupby(by=['card_hash'], as_index=False, dropna=False).agg({'card_type':'nunique'}) + card_type_payment_totals = self.data.groupby(by=['card_type', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_hash':'size', 'transaction_amount':'sum'}) + # test assertions + assert nunique_card_types_per_card['card_type'].max() == 1 + assert nunique_card_types_per_card['card_type'].min() == 0 + assert not nunique_card_types_per_card['card_type'].isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_card_types_per_card,x='card_type', bins = 20) + if self.show_logs: + # show logs + logging.info(card_type_payment_totals) + + def unique_country_codes(self): + """ + """ + nunique_country_codes_per_card = self.data.groupby(by=['card_hash'], as_index=False, dropna=False).agg({'card_country_code':'nunique'}) + card_country_code_payment_totals = self.data.assign(card_country_code=self.data['card_country_code'].notnull().astype(int)).groupby(by=['card_country_code', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_hash':'size', 'transaction_amount':'sum'}) + # test assertions + assert nunique_country_codes_per_card['card_country_code'].max() == 1 + assert nunique_country_codes_per_card['card_country_code'].min() == 0 + assert not nunique_country_codes_per_card['card_country_code'].isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_country_codes_per_card,x='card_country_code', bins = 20) + if self.show_logs: + # show logs + logging.info(card_country_code_payment_totals) + + def run_all(self): + """ + """ + self.unique_card_types() + self.unique_country_codes() + logging.info("All Cards QA checks passed.") \ No newline at end of file diff --git a/generator/qa/Ips.py b/generator/qa/Ips.py new file mode 100644 index 0000000..a2b4acf --- /dev/null +++ b/generator/qa/Ips.py @@ -0,0 +1,34 @@ +import logging +import seaborn as sns + +class Ips(): + + def __init__(self, data, show_logs=False, show_plots=False): + """ + """ + logging.info("Initialising Ips QA") + self.data = data + self.show_logs = show_logs + self.show_plots = show_plots + + def unique_country_codes(self): + """ + """ + nunique_country_codes_per_ip = self.data.groupby(by=['ip_hash'], as_index=False, dropna=False).agg({'ip_country_code':'nunique'}) + ip_country_codes_totals = self.data.groupby(by=['ip_country_code'], as_index=False, dropna=False).size() + # test assertions + assert nunique_country_codes_per_ip['ip_country_code'].max() == 1 + assert nunique_country_codes_per_ip['ip_country_code'].min() == 1 + assert not nunique_country_codes_per_ip['ip_country_code'].isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_country_codes_per_ip,x='ip_country_code', bins = 20) + if self.show_logs: + # show logs + logging.info(ip_country_codes_totals.to_markdown()) + + def run_all(self): + """ + """ + self.unique_country_codes() + logging.info("All Ips QA checks passed.") \ No newline at end of file 
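The QA suites introduced above can also be exercised standalone against a previously generated output file. A minimal sketch, assuming the working directory is `generator/` (so `cons` and `qa` import the same way as in `main.py`) and that a transaction level CSV already exists at `cons.fpath_randomtelecomtransdata`:

```python
import logging
import pandas as pd

import cons
import qa

# surface the INFO level messages the QA classes log
logging.basicConfig(level=logging.INFO)

# load a previously generated transaction level dataset
trans_data = pd.read_csv(cons.fpath_randomtelecomtransdata, parse_dates=['registration_date', 'transaction_date'])

# run each entity level QA suite; show_logs and show_plots default to False
qa.Uids(data=trans_data).run_all()
qa.Transactions(data=trans_data).run_all()
qa.Cards(data=trans_data, show_logs=True).run_all()
qa.Ips(data=trans_data, show_plots=True).run_all()
```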
diff --git a/generator/qa/Transactions.py b/generator/qa/Transactions.py new file mode 100644 index 0000000..9e82c77 --- /dev/null +++ b/generator/qa/Transactions.py @@ -0,0 +1,193 @@ +import logging +import seaborn as sns +import pandas as pd + +class Transactions(): + + def __init__(self, data, show_logs=False, show_plots=False): + """ + """ + logging.info("Initialising Transactions QA") + self.data = data + self.show_logs = show_logs + self.show_plots = show_plots + + def unique_trans_hash(self): + """ + """ + unique_trans_hash_cnt = self.data['transaction_hash'].value_counts().sort_values(ascending=False) + # test assertions + assert unique_trans_hash_cnt.max() == 1 + assert unique_trans_hash_cnt.min() == 1 + assert not unique_trans_hash_cnt.isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=unique_trans_hash_cnt.to_frame(),x='count', bins = 10) + + def unique_dates(self): + """ + """ + nunique_trans_dates_per_trans = self.data.groupby(by=['transaction_hash'], dropna=False, as_index=False).agg({'transaction_date':'nunique'}) + # test assertions + assert nunique_trans_dates_per_trans['transaction_date'].max() == 1 + assert nunique_trans_dates_per_trans['transaction_date'].min() == 1 + assert not nunique_trans_dates_per_trans['transaction_date'].isnull().any() + assert bool((pd.to_datetime(self.data['registration_date']) <= pd.to_datetime(self.data['transaction_date'])).all()) + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_trans_dates_per_trans,x='transaction_date', bins = 20) + + def unique_trans_amount(self): + """ + """ + nunique_trans_amounts_per_trans = self.data.groupby(by=['transaction_hash'], dropna=False, as_index=False).agg({'transaction_amount':'nunique'}) + # test assertions + assert nunique_trans_amounts_per_trans['transaction_amount'].max() == 1 + assert nunique_trans_amounts_per_trans['transaction_amount'].min() == 1 + assert not nunique_trans_amounts_per_trans['transaction_amount'].isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_trans_amounts_per_trans,x='transaction_amount', bins = 20) + + def unique_payment_method(self): + """ + """ + nunique_payment_method_per_trans = self.data.groupby(by=['transaction_hash'], dropna=False, as_index=False).agg({'transaction_payment_method':'nunique'}).sort_values('transaction_payment_method') + unique_payment_method_per_trans_hash = self.data.groupby(by=['card_type', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_hash':'size', 'transaction_amount':'sum'}) + # test assertions + assert nunique_payment_method_per_trans['transaction_payment_method'].max() == 1 + assert nunique_payment_method_per_trans['transaction_payment_method'].min() == 0 + assert not nunique_payment_method_per_trans['transaction_payment_method'].isnull().any() + assert unique_payment_method_per_trans_hash['card_type'].isnull().any() + assert unique_payment_method_per_trans_hash['card_type'].dropna().isin(['Mastercard', 'Visa']).all() + assert unique_payment_method_per_trans_hash['transaction_payment_method'].isnull().any() + assert unique_payment_method_per_trans_hash['transaction_payment_method'].dropna().isin(['Card', 'Points', 'Wallet']).all() + assert (unique_payment_method_per_trans_hash.loc[unique_payment_method_per_trans_hash['transaction_payment_method'].isnull(), 'transaction_amount'] == 0).all() + assert (unique_payment_method_per_trans_hash.loc[unique_payment_method_per_trans_hash['transaction_payment_method'].notnull(), 
'transaction_amount'] > 0).all() + assert unique_payment_method_per_trans_hash.loc[unique_payment_method_per_trans_hash['transaction_payment_method'] == 'Card', 'card_type'].isin(['Mastercard', 'Visa']).all() + assert unique_payment_method_per_trans_hash.loc[unique_payment_method_per_trans_hash['transaction_payment_method'] != 'Card', 'card_type'].isnull().all() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_payment_method_per_trans,x='transaction_payment_method', bins = 20) + if self.show_logs: + # show logs + logging.info(unique_payment_method_per_trans_hash.to_markdown()) + + def unique_payment_channel(self): + """ + """ + nunique_payment_channel_per_trans = self.data.groupby(by=['transaction_hash'], as_index=False).agg({'card_payment_channel':'nunique'}, dropna=False).sort_values('card_payment_channel') + unique_payment_channel_per_trans_hash = self.data.groupby(by=['transaction_payment_method', 'card_payment_channel'], as_index=False, dropna=False).agg({'transaction_hash':'size', 'transaction_amount':'sum'}) + # test assertions + assert nunique_payment_channel_per_trans['card_payment_channel'].max() == 1 + assert nunique_payment_channel_per_trans['card_payment_channel'].min() == 0 + assert not nunique_payment_channel_per_trans['card_payment_channel'].isnull().any() + assert unique_payment_channel_per_trans_hash['transaction_payment_method'].isnull().any() + assert unique_payment_channel_per_trans_hash['transaction_payment_method'].dropna().isin(['Card', 'Points', 'Wallet']).all() + assert unique_payment_channel_per_trans_hash['card_payment_channel'].isnull().any() + assert unique_payment_channel_per_trans_hash['card_payment_channel'].dropna().isin(['Adyen', 'AppStore', 'Docomo', 'PayPal', 'WorldPay']).all() + assert (unique_payment_channel_per_trans_hash.loc[unique_payment_channel_per_trans_hash['transaction_payment_method'].isnull(), 'transaction_amount'] == 0).all() + assert (unique_payment_channel_per_trans_hash.loc[unique_payment_channel_per_trans_hash['transaction_payment_method'].notnull(), 'transaction_amount'] > 0).all() + assert unique_payment_channel_per_trans_hash.loc[unique_payment_channel_per_trans_hash['transaction_payment_method'] == 'Card', 'card_payment_channel'].isin(['Adyen', 'AppStore', 'Docomo', 'PayPal', 'WorldPay']).all() + assert unique_payment_channel_per_trans_hash.loc[unique_payment_channel_per_trans_hash['transaction_payment_method'] != 'Card', 'card_payment_channel'].isnull().all() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_payment_channel_per_trans,x='card_payment_channel', bins = 20) + if self.show_logs: + # show logs + logging.info(unique_payment_channel_per_trans_hash.to_markdown()) + + def unique_trans_status(self): + """ + """ + nunique_trans_status_per_trans = self.data.groupby(by=['transaction_hash'], as_index=False).agg({'transaction_status':'nunique'}) + unique_trans_status_per_trans_hash = self.data.groupby(by=['transaction_error_code', 'transaction_status'], as_index=False, dropna=False).size() + # test assertions + assert nunique_trans_status_per_trans['transaction_status'].max() == 1 + assert nunique_trans_status_per_trans['transaction_status'].min() == 1 + assert not nunique_trans_status_per_trans['transaction_status'].isnull().any() + assert unique_trans_status_per_trans_hash['transaction_error_code'].isnull().any() + assert unique_trans_status_per_trans_hash['transaction_error_code'].dropna().isin(['E900:ConnectionTimeout', 'E901:SuspectedFraud', 'E902:AuthenicationFailure', 
'E903:UserCancelled', 'E904:InsufficientFunds']).all() + assert unique_trans_status_per_trans_hash['transaction_status'].notnull().all() + assert unique_trans_status_per_trans_hash['transaction_status'].dropna().isin(['Rejected', 'Pending', 'Successful']).all() + assert unique_trans_status_per_trans_hash.loc[unique_trans_status_per_trans_hash['transaction_error_code'].notnull(), 'transaction_status'].isin(['Rejected']).all() + assert unique_trans_status_per_trans_hash.loc[unique_trans_status_per_trans_hash['transaction_error_code'].isnull(), 'transaction_status'].isin(['Pending', 'Successful']).all() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_trans_status_per_trans,x='transaction_status', bins = 20) + if self.show_logs: + # show logs + logging.info(unique_trans_status_per_trans_hash.to_markdown()) + + def unique_error_codes(self): + """ + """ + nunique_errorcodes_per_trans = self.data.groupby(by=['transaction_hash']).agg({'transaction_error_code':'nunique'}, dropna=False, as_index=False) + unique_error_codes_statuses_per_trans_hash = self.data.groupby(by=['transaction_error_code', 'transaction_status'], as_index=False, dropna=False).size() + # test assertions + assert nunique_errorcodes_per_trans['transaction_error_code'].max() == 1 + assert nunique_errorcodes_per_trans['transaction_error_code'].min() == 0 + assert not nunique_errorcodes_per_trans['transaction_error_code'].isnull().any() + assert unique_error_codes_statuses_per_trans_hash['transaction_error_code'].isnull().any() + assert unique_error_codes_statuses_per_trans_hash['transaction_error_code'].dropna().isin(['E900:ConnectionTimeout', 'E901:SuspectedFraud', 'E902:AuthenicationFailure', 'E903:UserCancelled', 'E904:InsufficientFunds']).all() + assert unique_error_codes_statuses_per_trans_hash['transaction_status'].notnull().all() + assert unique_error_codes_statuses_per_trans_hash['transaction_status'].dropna().isin(['Rejected', 'Pending', 'Successful']).all() + assert unique_error_codes_statuses_per_trans_hash.loc[unique_error_codes_statuses_per_trans_hash['transaction_error_code'].notnull(), 'transaction_status'].isin(['Rejected']).all() + assert unique_error_codes_statuses_per_trans_hash.loc[unique_error_codes_statuses_per_trans_hash['transaction_error_code'].isnull(), 'transaction_status'].isin(['Pending', 'Successful']).all() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_errorcodes_per_trans,x='transaction_error_code', bins = 20) + if self.show_logs: + # show logs + logging.info(unique_error_codes_statuses_per_trans_hash.to_markdown()) + + def uid_max_device_trans_error_counts(self): + """ + """ + nunique_devices_per_uid = self.data.groupby(by='uid', as_index=False).agg({'device_hash':'nunique'}).sort_values(by='device_hash') + uids_max_devices = self.data.loc[self.data['uid'].isin(nunique_devices_per_uid['uid'].tail()), :].sort_values(by=['uid', 'device_hash', 'transaction_date']) + uids_with_high_device_hash_counts = uids_max_devices.groupby(by=['userid'], as_index=False).agg({'device_hash':'nunique', 'transaction_hash':'count', 'transaction_error_code':'count'}) + uids_with_high_device_hash_counts_error_codes = uids_max_devices.groupby(by=['transaction_error_code'], as_index=False).size().sort_values(by='size', ascending=False) + if self.show_logs: + # show logs + logging.info(uids_with_high_device_hash_counts.to_markdown()) + logging.info(uids_with_high_device_hash_counts_error_codes.to_markdown()) + + def uid_max_card_trans_error_counts(self): + """ + """ + 
nunique_cards_per_uid = self.data.groupby(by='uid', as_index=False).agg({'card_hash':'nunique'}).sort_values(by='card_hash') + uids_max_cards = self.data.loc[self.data['uid'].isin(nunique_cards_per_uid['uid'].tail()), :].sort_values(by=['uid', 'card_hash', 'transaction_date']) + uids_with_high_card_hash_counts = uids_max_cards.groupby(by=['userid'], as_index=False).agg({'card_hash':'nunique', 'transaction_hash':'count', 'transaction_error_code':'count'}) + uids_with_high_card_hash_counts_error_codes = uids_max_cards.groupby(by=['transaction_error_code'], as_index=False).size().sort_values(by='size', ascending=False) + if self.show_logs: + # show logs + logging.info(uids_with_high_card_hash_counts.to_markdown()) + logging.info(uids_with_high_card_hash_counts_error_codes.to_markdown()) + + def uid_max_ip_trans_error_counts(self): + """ + """ + nunique_ips_per_uid = self.data.groupby(by='uid', as_index=False, dropna=False).agg({'ip_hash':'nunique'}).sort_values(by='ip_hash') + uids_max_ips = self.data.loc[self.data['uid'].isin(nunique_ips_per_uid['uid'].tail()), :].sort_values(by=['uid', 'ip_hash', 'transaction_date']) + uids_with_high_ip_hash_counts = uids_max_ips.groupby(by=['userid'], as_index=False).agg({'ip_hash':'nunique', 'transaction_hash':'count', 'transaction_error_code':'count'}) + uids_with_high_ip_hash_counts_error_codes = uids_max_ips.groupby(by=['transaction_error_code'], as_index=False).size().sort_values(by='size', ascending=False) + if self.show_logs: + # show logs + logging.info(uids_with_high_ip_hash_counts.to_markdown()) + logging.info(uids_with_high_ip_hash_counts_error_codes.to_markdown()) + + def run_all(self): + """ + """ + self.unique_trans_hash() + self.unique_dates() + self.unique_trans_amount() + self.unique_payment_method() + self.unique_payment_channel() + self.unique_trans_status() + self.unique_error_codes() + self.uid_max_device_trans_error_counts() + self.uid_max_card_trans_error_counts() + self.uid_max_ip_trans_error_counts() + logging.info("All Transactions QA checks passed.") \ No newline at end of file diff --git a/generator/qa/Uids.py b/generator/qa/Uids.py new file mode 100644 index 0000000..32b653d --- /dev/null +++ b/generator/qa/Uids.py @@ -0,0 +1,157 @@ +import logging +import seaborn as sns +import pandas as pd + +class Uids(): + + def __init__(self, data, show_logs=False, show_plots=False): + """ + """ + logging.info("Initialising Uids QA") + self.data = data + self.show_logs = show_logs + self.show_plots = show_plots + + def unique_user_ids(self): + """ + """ + nunique_user_ids_per_uid = self.data.groupby(by='uid', dropna=False, as_index=False).agg({'userid':'nunique'}).sort_values(by=['userid']) + # test assertions + assert nunique_user_ids_per_uid['userid'].max() == 1 + assert nunique_user_ids_per_uid['userid'].min() == 1 + assert not self.data['userid'].isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_user_ids_per_uid,x='userid', bins = 20) + + def unique_names(self): + """ + """ + tmp_data = self.data.copy() + tmp_data['fullname'] = tmp_data['first_name'] + ' ' + tmp_data['last_name'] + nunique_names_per_uid = tmp_data.groupby(['userid'], dropna=False, as_index=False).agg({'fullname':'nunique'}).sort_values(by=['fullname']) + # test assertions + assert nunique_names_per_uid['fullname'].max() <= 3 + assert nunique_names_per_uid['fullname'].min() == 1 + assert not tmp_data['fullname'].isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_names_per_uid,x='fullname', bins = 
20) + + def unique_reg_dates(self): + """ + """ + nunique_reg_date_per_uid = self.data.groupby(by='uid', dropna=False, as_index=False).agg({'registration_date':'nunique'}).sort_values(by=['registration_date']) + # test assertions + assert nunique_reg_date_per_uid['registration_date'].max() == 1 + assert nunique_reg_date_per_uid['registration_date'].min() == 1 + assert not self.data['registration_date'].isnull().any() + assert bool((pd.to_datetime(self.data['registration_date']) <= pd.to_datetime(self.data['transaction_date'])).all()) + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_reg_date_per_uid,x='registration_date', bins = 20) + + def nunique_reg_countries(self): + """ + """ + nunique_reg_country_per_uid = self.data.groupby(by='uid', dropna=False, as_index=False).agg({'registration_country_code':'nunique'}).sort_values(by=['registration_country_code']) + # test assertions + assert nunique_reg_country_per_uid['registration_country_code'].max() == 1 + assert nunique_reg_country_per_uid['registration_country_code'].min() == 1 + assert not self.data['registration_country_code'].isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_reg_country_per_uid,x='registration_country_code', bins = 20) + + def unique_email_domains(self): + """ + """ + nunique_email_domains_per_uid = self.data.groupby(by='uid', dropna=False, as_index=False).agg({'email_domain':'nunique'}).sort_values(by=['email_domain']) + # test assertions + assert nunique_email_domains_per_uid['email_domain'].max() == 1 + assert nunique_email_domains_per_uid['email_domain'].min() == 1 + assert not self.data['email_domain'].isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_email_domains_per_uid,x='email_domain', bins = 20) + + def unique_devices(self): + """ + """ + nunique_devices_per_uid = self.data.groupby(by='uid', dropna=False, as_index=False).agg({'device_hash':'nunique'}).sort_values(by=['device_hash']) + # test assertions + assert nunique_devices_per_uid['device_hash'].max() <= 25 + assert nunique_devices_per_uid['device_hash'].min() == 1 + assert not self.data['device_hash'].isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_devices_per_uid,x='device_hash', bins = 20) + + def unique_cards(self): + """ + """ + nunique_cards_per_uid = self.data.groupby(by='uid', dropna=False, as_index=False).agg({'card_hash':'nunique'}).sort_values(by=['card_hash']) + # test assertions + assert nunique_cards_per_uid['card_hash'].max() <= 20 + assert nunique_cards_per_uid['card_hash'].min() == 0 + assert self.data['card_hash'].isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_cards_per_uid,x='card_hash', bins = 20) + + def unique_ips(self): + """ + """ + nunique_ips_per_uid = self.data.groupby(by='uid', dropna=False, as_index=False).agg({'ip_hash':'nunique'}).sort_values(by=['ip_hash']) + # test assertions + assert nunique_ips_per_uid['ip_hash'].max() <= 45 + assert nunique_ips_per_uid['ip_hash'].min() == 1 + assert not self.data['ip_hash'].isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_ips_per_uid,x='ip_hash', bins = 10) + + def unique_apps(self): + """ + """ + nunique_apps_per_uid = self.data.groupby(by='uid', dropna=False, as_index=False).agg({'application_hash':'nunique'}).sort_values(by=['application_hash']) + # test assertions + assert nunique_apps_per_uid['application_hash'].max() <= 50 + assert 
nunique_apps_per_uid['application_hash'].min() == 1 + assert not self.data['application_hash'].isnull().any() + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_apps_per_uid,x='application_hash', bins = 10) + + def unique_transactions(self): + """ + """ + nunique_trans_per_uid = self.data.groupby(by='uid', dropna=False, as_index=False).agg({'transaction_hash':'nunique'}).sort_values(by=['transaction_hash']) + transaction_payment_rel = self.data.assign(transaction_hash=self.data['transaction_hash'].notnull().astype(int)).groupby(by=['transaction_hash', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_amount':'sum'}) + # test assertions + assert nunique_trans_per_uid['transaction_hash'].max() <= 300 + assert nunique_trans_per_uid['transaction_hash'].min() == 1 + assert not self.data['transaction_hash'].isnull().any() + assert transaction_payment_rel.loc[transaction_payment_rel['transaction_payment_method'] == "Card", "transaction_amount"].iloc[0] > 0 + assert transaction_payment_rel.loc[transaction_payment_rel['transaction_payment_method'] == "Points", "transaction_amount"].iloc[0] > 0 + assert transaction_payment_rel.loc[transaction_payment_rel['transaction_payment_method'] == "Wallet", "transaction_amount"].iloc[0] > 0 + assert transaction_payment_rel.loc[transaction_payment_rel['transaction_payment_method'].isnull(), "transaction_amount"].iloc[0] == 0 + if self.show_plots: + # plot distribution + sns.histplot(data=nunique_trans_per_uid,x='transaction_hash', bins=10) + if self.show_logs: + logging.info(transaction_payment_rel.to_markdown()) + + def run_all(self): + """ + """ + self.unique_user_ids() + self.unique_names() + self.unique_reg_dates() + self.nunique_reg_countries() + self.unique_email_domains() + self.unique_devices() + self.unique_cards() + self.unique_ips() + self.unique_apps() + self.unique_transactions() + logging.info("All Uids QA checks passed.") \ No newline at end of file diff --git a/generator/qa/__init__.py b/generator/qa/__init__.py new file mode 100644 index 0000000..1fe3fa8 --- /dev/null +++ b/generator/qa/__init__.py @@ -0,0 +1,4 @@ +from .Uids import Uids +from .Transactions import Transactions +from .Cards import Cards +from .Ips import Ips \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 1c46a04..666e23b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,8 @@ dependencies = [ "fastapi[standard]==0.128.0", "fastparquet==2024.11.0", "ipykernel==6.29.5", + "nbclient==0.10.4", + "nbformat==5.10.4", "networkx==3.4.2", "numpy==2.0.2", "pandas==2.2.3", @@ -19,5 +21,6 @@ dependencies = [ "scipy==1.15.1", "seaborn==0.13.2", "shap==0.46.0", + "tabulate==0.9.0", "unidecode==1.3.8", ] diff --git a/report/customer_value_score.qmd b/report/customer_value_score.qmd deleted file mode 100644 index 0147f4f..0000000 --- a/report/customer_value_score.qmd +++ /dev/null @@ -1,95 +0,0 @@ - -```{python} -#| label: set-up -import os -import numpy as np -import pandas as pd -import os -import sys - -sys.path.append(os.getcwd()) -sys.path.append(os.path.dirname(os.getcwd())) - -import generator.cons as cons -``` - -# Data Load - -```{python} -#| label: data-load -# load user feature data -user_feat_foath = os.path.join('..', 'data', 'report', 'user_feat_data.csv') -feat_data = pd.read_csv(user_feat_foath) -feat_data.head() -``` - -# Customer Value Score - -Percentile score / rank the users across each week number based on their successful transaction count, and amount total. 
- -```{python} -#| label: score-data -def week_pct_score(group, score_cols = ['successful_size','successful_sum']): - """ - """ - # percentile rank the score columns - group_score = group[score_cols].rank(method='average', ascending=True, pct=True, axis=0) - group_score.columns = group_score.columns + '_pct' - # join score results back to groups - group_results = group.join(group_score) - return group_results - -def gen_weekly_user_scores(group): - """ - """ - # define score and id columns - id_cols = ['userid', 'transaction_week'] - score_cols=['successful_size_pct', 'successful_sum_pct'] - value_cols = ['customer_value_score'] - # calcualte the customer value score - group['customer_value_score'] = group[score_cols].mean(axis=1) - return group[id_cols+score_cols+value_cols] - -# only conder users who made at least one transaction in any given week -score_data_week = feat_data.loc[feat_data['successful_size'] > 0, :].copy() -# score each user across each week for their percentile score in number of successfull transactions counts and acounts -score_data_week = score_data_week.groupby(by=['transaction_week'], group_keys=False).apply(lambda group: week_pct_score(group)) -# group by each user and apply a cumulative sum to determine weekly values scores over time -score_data_week = score_data_week.groupby(by=['userid'], group_keys=False).apply(lambda group: gen_weekly_user_scores(group)) -score_data_week.head(10) -``` - -# Create Base Value Score Dataset - -```{python} -#| label: base-data -def apply_cumsum(group): - """ - """ - # define score and id columns - id_cols = ['userid', 'transaction_week'] - score_cols=['successful_size_pct', 'successful_sum_pct'] - value_cols = ['customer_value_score', 'customer_value_score_cumsum'] - # sort and apply cumsum - group_sort = group.sort_values(by='transaction_week') - group_sort['customer_value_score_cumsum'] = group_sort['customer_value_score'].cumsum() - return group_sort[id_cols+score_cols+value_cols] - -# create base data -base_user_data = feat_data[['userid']].drop_duplicates().reset_index(drop=True).assign(key = 1).sort_values(by='userid') -base_transweek_data = feat_data[['transaction_week']].drop_duplicates().reset_index(drop=True).assign(key = 1).sort_values(by='transaction_week') -base_data = pd.merge(left=base_user_data, right=base_transweek_data, on='key', how='inner').drop(columns=['key']) -base_score_data = pd.merge(left=base_data, right=score_data_week, on=['userid','transaction_week'], how='left') -# fill missing pct and score values as 0 -base_score_data = base_score_data.fillna(0) -# apply cumulative sum to generate value scores cumulative increase over time -base_score_data = base_score_data.groupby(by=['userid'], group_keys=False).apply(lambda group: apply_cumsum(group)) -``` - -# Write Data to Disk - -```{python} -#| label: write data -user_score_data_fpath=os.path.join('..', 'data', 'report', 'customer_value_score.csv') -base_score_data.to_csv(user_score_data_fpath, index=False) -``` diff --git a/report/entity_count_dists.qmd b/report/entity_count_dists.qmd deleted file mode 100644 index 0ab63bf..0000000 --- a/report/entity_count_dists.qmd +++ /dev/null @@ -1,88 +0,0 @@ -# Entity Count Distributions - -```{python} -#| label: set-up -import sys -import os -import matplotlib.pyplot as plt -import seaborn as sns - -sys.path.append(os.getcwd()) -sys.path.append(os.path.dirname(os.getcwd())) - -import generator.cons as cons -from generator.utilities.gen_random_poisson_power import gen_random_poisson_power - -data_model_poisson_params 
= { - 'user':{'lambda':20, 'power':1}, - 'device':{'lambda':0.2, 'power':2}, - 'card':{'lambda':0.1, 'power':2}, - 'ip':{'lambda':1.3, 'power':2}, - 'application':{'lambda':1, 'power':2}, - 'transaction':{'lambda':5, 'power':2} - } - -def distplot(data, bins='auto'): - """ - """ - # set plot size and style - sns.set(rc={'figure.figsize':(7, 7), "lines.linewidth": 0.7}) - sns.set_style("white") - displot = sns.histplot(data=data, bins=bins) -``` - -# Device - -The count of used devices per userid. - -```{python} -#| label: device-count-per-userid -device_poisson_params = data_model_poisson_params['device'] -device_cnts_per_user = gen_random_poisson_power(lam=device_poisson_params['lambda'], size=1000, power=device_poisson_params['power']) -distplot(data=device_cnts_per_user) - -``` - -# Card - -The count of used cards per userid. - -```{python} -#| label: card-count-per-userid -card_poisson_params = data_model_poisson_params['card'] -card_cnts_per_user = gen_random_poisson_power(lam=card_poisson_params['lambda'], size=1000, power=card_poisson_params['power']) -distplot(data=card_cnts_per_user) -``` - -# IP - -The count of used IPs per userid. - -```{python} -#| label: ip-count-per-userid -ip_poisson_params = data_model_poisson_params['ip'] -ip_cnts_per_user = gen_random_poisson_power(lam=ip_poisson_params['lambda'], size=1000, power=ip_poisson_params['power']) -distplot(data=ip_cnts_per_user) -``` - -# Application - -The count of applications per userid. - -```{python} -#| label: application-count-per-userid -app_poisson_params = data_model_poisson_params['application'] -app_cnts_per_user = gen_random_poisson_power(lam=app_poisson_params['lambda'], size=1000, power=app_poisson_params['power']) -distplot(data=app_cnts_per_user) -``` - -# Transaction - -The count of transactions per userid. 
- -```{python} -#| label: transaction-count-per-userid -trans_poisson_params = data_model_poisson_params['transaction'] -trans_cnts_per_user = gen_random_poisson_power(lam=trans_poisson_params['lambda'], size=1000, power=trans_poisson_params['power']) -distplot(data=trans_cnts_per_user) -``` \ No newline at end of file diff --git a/report/exe_quarto_reports.cmd b/report/exe_quarto_reports.cmd new file mode 100644 index 0000000..d3200eb --- /dev/null +++ b/report/exe_quarto_reports.cmd @@ -0,0 +1,8 @@ +call uv run quarto render qmarkdown/qa.qmd +call uv run quarto render qmarkdown/entity_count_dists.qmd +call uv run quarto render qmarkdown/feature_engineering.qmd +call uv run quarto render qmarkdown/customer_value_score.qmd +call uv run quarto render qmarkdown/white_list_users.qmd +call uv run quarto render qmarkdown/network.qmd +call uv run quarto render qmarkdown/isolation_forests.qmd +call uv run quarto render qmarkdown/risk_score.qmd \ No newline at end of file diff --git a/report/isolation_forests.qmd b/report/isolation_forests.qmd deleted file mode 100644 index 2a0eb09..0000000 --- a/report/isolation_forests.qmd +++ /dev/null @@ -1,136 +0,0 @@ - -```{python} -#| label: set-up -import os -import numpy as np -import pandas as pd -import sklearn as sk -import shap -import os -import sys -from datetime import datetime -from sklearn.ensemble import IsolationForest -import pickle - -sys.path.append(os.getcwd()) -sys.path.append(os.path.dirname(os.getcwd())) - -import generator.cons as cons -``` - -# Data Load - -```{python} -#| label: data-load -# load user feature data -user_feat_foath = os.path.join('..', 'data', 'report', 'user_feat_data.csv') -feat_data = pd.read_csv(user_feat_foath) - -# load network data -comp_data_fpath=os.path.join('..', 'data', 'report', 'user_comp_data.csv') -comp_data = pd.read_csv(comp_data_fpath) - -# join feature and component data -model_data = pd.merge(left=feat_data, right=comp_data, on=['userid', 'transaction_week'], how='left').fillna(0) -# order model date by transaction week and userids -model_data = model_data.sort_values(by=['transaction_week', 'userid']).reset_index(drop=True) -``` - -# Isolation Forests Model - -```{python} - -class IsolationForestsModel(): - def __init__(self, n_estimators=20, random_state=None, warm_start=False, n_jobs=None): - self.model = IsolationForest(n_estimators=n_estimators, random_state=random_state, warm_start=warm_start) - self.n_estimators = n_estimators - def fit(self, X): - if self.model.warm_start: - self.model.n_estimators += self.n_estimators - self.model = self.model.fit(X) - return self - def decision_function(self, X): - return self.model.decision_function(X) - def write(self, model_fpath): - with open(model_fpath,'wb') as f: - pickle.dump(self,f) - def read(self, model_fpath): - with open(model_fpath, 'rb') as f: - return pickle.load(f) - -# initiate isolation forest model -model = IsolationForestsModel(n_estimators=5, random_state=None, n_jobs=2, warm_start=True) - -if False: - # write load to disk - model_fpath = os.path.join('..', 'data', 'report', 'isolation_forests_model.pickle') - model.write(model_fpath) -``` - -# Score Data - -```{python} -#| label: score-data - -def apply_isolation_forests(group, model_fpath): - """ - """ - # initiate and load isolation forest model - model = IsolationForestsModel() - model = model.read(model_fpath) - # split data - id_cols = ['userid', 'transaction_week'] - X_cols = ['E901_size', 'E901_sum', 'E902_size', 'E902_sum', 'n_comps', 'total_comp_size'] - train_group = 
group[X_cols] - score_group = group[id_cols+X_cols] - # train isolation forests and score data - model = model.fit(train_group) - score_group['score'] = model.decision_function(train_group) - # write model to disk - model.write(model_fpath) - return score_group - -def gen_anomaly_score(group): - """ - """ - group_sort = group.sort_values('transaction_week') - group_sort['anomaly_score'] = group_sort['score'].cumsum() - return group_sort - -if False: - # apply isolation forests model across each transaction week - score_data = model_data.groupby(by=['transaction_week'], group_keys=False, as_index=False).apply(lambda group: apply_isolation_forests(group, model_fpath=model_fpath)) - # generate anomaly score - anomaly_data = score_data.groupby(by=['userid'], group_keys=False, as_index=False).apply(lambda group: gen_anomaly_score(group)) - # sort data by most anaomalious - anomaly_data = anomaly_data.sort_values(by=['userid', 'transaction_week']).reset_index(drop=True) - anomaly_data['black_list_users'] = (anomaly_data['anomaly_score'] < 0).astype(int) -``` - -# Write Anomalious Data - -```{python} -#| label: write-data -# save user anaomly data to disk -score_data_users_fpath=os.path.join('..', 'data', 'report', 'user_anomaly_data.csv') -anomaly_data.to_csv(score_data_users_fpath, index=False) -``` - -# Evaluate Model - -```{python} -# initiate and load isolation forest model -model = IsolationForestsModel() -model_fpath = os.path.join('..', 'data', 'report', 'isolation_forests_model.pickle') -model = model.read(model_fpath) -# load scored data -score_data_users_fpath=os.path.join('..', 'data', 'report', 'user_anomaly_data.csv') -anomaly_data = pd.read_csv(score_data_users_fpath) -eval_data = anomaly_data.loc[anomaly_data['transaction_week'] == 52, :].copy() -# generate shap beeswarm plot -pred = (eval_data['anomaly_score'] < 0).astype(int) -explainer = shap.TreeExplainer(model.model) -X_cols = ['E901_size', 'E901_sum', 'E902_size', 'E902_sum', 'n_comps', 'total_comp_size'] -explanation = explainer(eval_data[X_cols]) -shap.plots.beeswarm(explanation) -``` diff --git a/report/network.qmd b/report/network.qmd deleted file mode 100644 index 6ce9c97..0000000 --- a/report/network.qmd +++ /dev/null @@ -1,153 +0,0 @@ - -```{python} -#| label: set-up -import os -import numpy as np -import pandas as pd -import sklearn as sk -import networkx as nx -import matplotlib.pyplot as plt -import shap -import os -import sys -from datetime import datetime -from sklearn.ensemble import IsolationForest - -sys.path.append(os.getcwd()) -sys.path.append(os.path.dirname(os.getcwd())) - -import generator.cons as cons -``` - -# Load Random Telecom Payments Data - -```{python} -#| label: data-load -# load random telecom payments data -data_fpath=os.path.join('..', 'data', 'arch', 'RandomTelecomPaymentsV1.1.csv') -parse_dates = ['registration_date', 'transaction_date'] -date_parser = lambda x: datetime.strptime(x, '%Y-%m-%d') -data = pd.read_csv(filepath_or_buffer=data_fpath, parse_dates=parse_dates) - -# determine the week number for all transaction dates -data['transaction_week'] = data['transaction_date'].dt.isocalendar().week -``` - -# Data Preparation - -```{python} -#| label: data-preparation - -def gen_entity_network_data(data, entity, userid = 'userid', trans_week = 'transaction_week'): - """ - """ - # extract out the unique userids and device hashes - user_entity_data = data[[userid, entity, trans_week]].dropna().drop_duplicates() - # inner join users to users based on shared device hash - user_entity_network_data 
= pd.merge(left = user_entity_data, right = user_entity_data, on = [entity, trans_week], how = 'inner') - # drop rows where userid_x = userid_y - user_entity_network_data = user_entity_network_data.loc[user_entity_network_data[f'{userid}_x'] != user_entity_network_data[f'{userid}_y'], :] - # set col order - col_order = [f'{userid}_x', f'{userid}_y', entity, trans_week] - user_entity_network_data = user_entity_network_data[col_order] - return user_entity_data, user_entity_network_data - -def gen_base_network_data(entity_networks, user_cols=['userid_x','userid_y']): - """ - """ - # create a base data of users from all entity networks - base_data = pd.concat(objs=[df[user_cols] for df in entity_networks], ignore_index=True, axis=0).drop_duplicates().reset_index(drop=True) - # generate full base data by joining on all entity networks - for entity_network in entity_networks: - base_data = pd.merge(left=base_data, right=entity_network, on=user_cols, how='left') - return base_data - -# generate share user entity networks -user_device_data, user_device_network_data = gen_entity_network_data(data=data, entity='device_hash', userid = 'userid') -user_ip_data, user_ip_network_data = gen_entity_network_data(data=data, entity='ip_hash', userid = 'userid') -user_card_data, user_card_network_data = gen_entity_network_data(data=data, entity='card_hash', userid = 'userid') -# generate base entity network data -entity_networks = [user_device_network_data, user_ip_network_data, user_card_network_data] -base_data = gen_base_network_data(entity_networks=entity_networks, user_cols=['userid_x','userid_y']) -``` - -# Network Analysis - -```{python} -#| label: network-analysis - -def gen_comp_data(network_data, entity_data, edge_attr): - """ - """ - # apply graphs for each week - trans_week_graphs = network_data.groupby(by='transaction_week').apply(lambda group: nx.from_pandas_edgelist(df = group, source = 'userid_x', target = 'userid_y', edge_attr = [edge_attr])).rename('G').reset_index() - # extract connected components for each week - trans_week_comps = trans_week_graphs.apply(lambda series: pd.DataFrame([{'transaction_week':series['transaction_week'], 'compid':i, 'userid':cc} for i, cc in enumerate(nx.connected_components(series['G']))]).explode('userid').reset_index(drop = True), axis=1).to_list() - trans_week_comps = pd.concat(trans_week_comps, axis=0) - # calculate compid sizes across each week - trans_week_comps_size = trans_week_comps.groupby(by = ['transaction_week', 'compid'], as_index = False).agg({'userid':'nunique'}).rename(columns={'userid':'compsize'}) - # generate the component data - comp_data = pd.merge(left = entity_data, right = trans_week_comps, left_on = ['transaction_week', 'userid'], right_on = ['transaction_week', 'userid'], how = 'inner') - comp_data = pd.merge(left = comp_data, right = trans_week_comps_size, on = ['transaction_week', 'compid'], how = 'inner') - # order by comp size - comp_data = comp_data.sort_values(by = ['transaction_week', 'compid', 'userid', edge_attr]).reset_index(drop=True) - # normalise data with respect to edge attribute - comp_data = comp_data.rename(columns={edge_attr:'idhashes'}) - comp_data['type'] = edge_attr - return comp_data - -# generate components for all entities -user_device_comp_data = gen_comp_data(network_data=user_device_network_data, entity_data = user_device_data, edge_attr='device_hash') -user_ip_comp_data = gen_comp_data(network_data=user_ip_network_data, entity_data = user_ip_data, edge_attr='ip_hash') -user_card_comp_data = 
gen_comp_data(network_data=user_card_network_data, entity_data = user_card_data, edge_attr='card_hash') -# concatenate component data together -concat_objs = [user_device_comp_data, user_ip_comp_data, user_card_comp_data] -user_entity_comp_data = pd.concat(objs=concat_objs, axis=0, ignore_index=True) -``` - -# Graph Connected Component - -```{python} -#| label: connected-component -type_filter = user_entity_comp_data['type'].isin(['device_hash', 'card_hash', 'ip_hash']) -trans_week_filter = user_entity_comp_data['transaction_week'].isin([52]) -comp_data = user_entity_comp_data.loc[type_filter & trans_week_filter, :].copy() - -# generate grpah from component data -compids = comp_data['compid'].unique() -comp_data = comp_data.loc[comp_data['compid'].isin(compids), :] -G = nx.from_pandas_edgelist(df = comp_data, source = 'userid', target = 'idhashes', edge_attr = ['compid', 'compsize', 'type']) - -# define node colours -idhash_colours = comp_data.drop_duplicates(subset=['idhashes', 'type']).set_index('idhashes')['type'].replace({'device_hash':'orange', 'ip_hash':'yellow', 'card_hash':'red'}) -userid_colours = comp_data.drop_duplicates(subset=['userid']).set_index('userid').assign(colour='blue')['colour'] -colours_df = pd.concat(objs = [userid_colours, idhash_colours], axis=0).rename('colours').to_frame().reset_index().rename(columns={'index':'nodes'}) -# join color to nodes -node_colours_df = pd.Series(list(G), name='nodes').to_frame().merge(colours_df, on='nodes', how='left') - -# plot network -fig, ax = plt.subplots() -nx.draw_networkx(G, pos = nx.spring_layout(G), with_labels = True, node_size = 30, font_size = 1, node_color=node_colours_df['colours'].to_list(), ax=ax) -ax.set(title="Connected Component") -plt.show() -``` - -# Feature Data - -```{python} -# generate user network feature data -groupby_cols = ['userid', 'transaction_week'] -agg_dict = {'type':'nunique', 'compsize':'sum'} -rename_dict = {'type':'n_comps', 'compsize':'total_comp_size'} -feat_data = user_entity_comp_data.groupby(by=groupby_cols, as_index=False).agg(agg_dict).rename(columns=rename_dict) -# sort by n comps -feat_data = feat_data.sort_values(by=['transaction_week', 'userid']).reset_index(drop=True) -``` - -# Write Data - -```{python} -#| label: write data -feat_data_fpath=os.path.join('..', 'data', 'report', 'user_comp_data.csv') -feat_data.to_csv(feat_data_fpath, index=False) -``` \ No newline at end of file diff --git a/report/qa.qmd b/report/qa.qmd deleted file mode 100644 index 0498ac6..0000000 --- a/report/qa.qmd +++ /dev/null @@ -1,296 +0,0 @@ ---- -title: "QA Report" -format: - html: - code-fold: true -jupyter: python3 ---- - -# Random Telecom Payments QA Report - -```{python} -#| label: data-load -#| -import os -import sys -import pandas as pd -import seaborn as sns -import matplotlib.pyplot as plt - -# set file path for custom python modules -root_dir = os.path.dirname(os.path.join(os.getcwd())) -sys.path.append(os.path.join(root_dir, 'generator')) - -import cons - -# load data -pd.set_option('display.max_columns', None) -parse_dates = ['registration_date', 'transaction_date'] -date_parser = lambda x: datetime.strptime(x, '%Y-%m-%d') -data = pd.read_csv(cons.fpath_randomtelecomtransdata, parse_dates=parse_dates) -userdata = pd.read_parquet(cons.fpath_randomtelecomusersdata) - -# show head of data -data.head() -``` - -## Users - -Check that the data makes sense and there are no anomalies at a user level. - -### Unique UserIds per UID - -There shoule be exactly one unique userid for every UID. 
- -```{python} -#| label: nunique-userids-per-uid -nunique_userids_per_uid = data.groupby(by='uid').agg({'userid':'nunique'}) -sns.histplot(data=nunique_userids_per_uid,x='userid', bins = 20) -data['userid'].notnull().value_counts() -``` - -### Unique Fullnames per UID - -There should be exactly one unique fullname for every UID. - -```{python} -#| label: nunique-names-per-uid -tmp_data = data.copy() -tmp_data['fullname'] = tmp_data['firstname'] + ' ' + tmp_data['lastname'] -nunique_names_per_uid = tmp_data.groupby(['userid']).agg({'fullname':'nunique'}) -sns.histplot(data=nunique_names_per_uid,x='fullname', bins = 20) -tmp_data['fullname'].notnull().value_counts() -``` - -### Unique Registration Dates per UID - -A user should register only on a single date. - -```{python} -#| label: nunique-regdates-per-uid -nunique_regdate_per_uid = data.groupby(by='uid').agg({'registration_date':'nunique'}) -sns.histplot(data=nunique_regdate_per_uid,x='registration_date', bins = 20) -data['registration_date'].notnull().value_counts() -``` - -### Unique Registration Countries per UID - -When registering the user should set their country code of residence. - -```{python} -#| label: nunique-regcountries-per-uid -nunique_regcountry_per_uid = data.groupby(by='uid').agg({'registration_country_code':'nunique'}) -sns.histplot(data=nunique_regcountry_per_uid,x='registration_country_code', bins = 20) -data['registration_country_code'].notnull().value_counts() -``` - -### Unique Email Domains per UID - -A user should register with a single email address corresponding to a single email domain. - -```{python} -#| label: nunique-emaildomains-per-uid -nunique_emaildomains_per_uid = data.groupby(by='uid').agg({'email_domain':'nunique'}) -sns.histplot(data=nunique_emaildomains_per_uid,x='email_domain', bins = 20) -data['email_domain'].notnull().value_counts() -``` - -### Unique Device Hash per UID - -A UID should have 1 to 3 devices. - -```{python} -#| label: nunique-devices-per-uid -nunique_devices_per_uid = data.groupby(by='uid').agg({'device_hash':'nunique'}) -sns.histplot(data=nunique_devices_per_uid,x='device_hash', bins = 20) -data['device_hash'].notnull().value_counts() -``` - -### Unique Card Hash per UID - -A UID should have 1 to 2 cards, with an overall distribution less than the corresponding device hash distribution. - -```{python} -#| label: nunique-cards-per-uid -nunique_cards_per_uid = data.groupby(by='uid').agg({'card_hash':'nunique'}) -sns.histplot(data=nunique_cards_per_uid,x='card_hash', bins = 20) -data['card_hash'].notnull().value_counts() -``` - -### Unique IP Hash per UID - -A UID should have between 1 and 10 ips. 
- -```{python} -#| label: nunique-ips-per-uid -nunique_ips_per_uid = data.groupby(by='uid').agg({'ip_hash':'nunique'}) -sns.histplot(data=nunique_ips_per_uid,x='ip_hash', bins = 10) -data['ip_hash'].notnull().value_counts() -``` - -### Unique Application Hash per UID - -```{python} -#| label: nunique-apps-per-uid -nunique_apps_per_uid = data.groupby(by='uid').agg({'application_hash':'nunique'}) -sns.histplot(data=nunique_apps_per_uid,x='application_hash', bins = 10) -data['ip_hash'].notnull().value_counts() -``` - -### Unique Transaction Hash per UID - -```{python} -#| label: nunique-ips-per-uid -nunique_ips_per_uid = data.groupby(by='uid').agg({'transaction_hash':'nunique'}) -sns.histplot(data=nunique_ips_per_uid,x='transaction_hash', bins=10) -data.assign(transaction_hash=data['transaction_hash'].notnull().astype(int)).groupby(by=['transaction_hash', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_amount':'sum'}) -``` - -## Transaction - -### Unique Transaction Hash - -```{python} -data['transaction_hash'].value_counts().sort_values(ascending=False).head() -``` - -### Unique Date per Transaction Hash - -Each transaction hash should have a single date associated with it - -```{python} -#| label: nunique-dates-per-trans -nunique_transdates_per_trans = data.groupby(by=['transaction_hash']).agg({'transaction_date':'nunique'}) -sns.histplot(data=nunique_transdates_per_trans,x='transaction_date', bins = 20) -``` - -### Unique Amount per Transaction Hash - -Each transaction hash should have a single transaction amount associated with it - -```{python} -#| label: nunique-transamount-per-trans -nunique_transamounts_per_trans = data.groupby(by=['transaction_hash']).agg({'transaction_amount':'nunique'}) -sns.histplot(data=nunique_transamounts_per_trans,x='transaction_amount', bins = 20) -``` - -### Unique Payment Method per Transaction Hash - -Each transaction hash should have a single transaction payment method associated with it. Note, in some circumstances the payment method is missing as the transaction amount was 0. - -```{python} -#| label: nunique-paymentmethod-per-trans -nunique_paymentmethod_per_trans = data.groupby(by=['transaction_hash']).agg({'transaction_payment_method':'nunique'}).sort_values('transaction_payment_method') -sns.histplot(data=nunique_paymentmethod_per_trans,x='transaction_payment_method', bins = 20) -data.loc[data['transaction_hash'].isin(nunique_paymentmethod_per_trans[(nunique_paymentmethod_per_trans == 0)].index[:5]), :] -data.loc[data['transaction_hash'].isin(nunique_paymentmethod_per_trans[(nunique_paymentmethod_per_trans == 0)].index[:5]), :] -data.groupby(by=['card_type', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_hash':'size', 'transaction_amount':'sum'}) -``` - -### Unique Payment Channel per Transaction Hash - -Each transaction hash should have a single transaction payment channel with it. Note in some circumstances the payment channel is missing when the transaction amount is 0, or the payment method is wallet or points. 
- -```{python} -#| label: nunique-paymentchannel-per-trans -nunique_paymentchannel_per_trans = data.groupby(by=['transaction_hash']).agg({'card_payment_channel':'nunique'}).sort_values('card_payment_channel') -sns.histplot(data=nunique_paymentchannel_per_trans,x='card_payment_channel', bins = 20) -data.groupby(by=['transaction_payment_method', 'card_payment_channel'], as_index=False, dropna=False).agg({'transaction_hash':'size', 'transaction_amount':'sum'}) -``` - -### Unique Transaction Status per Transaction Hash - -Each transaction hash should have a single unique payment status associated with it. - -```{python} -#| label: nunique-transstatus-per-trans -nunique_transstatus_per_trans = data.groupby(by=['transaction_hash']).agg({'transaction_status':'nunique'}) -sns.histplot(data=nunique_transstatus_per_trans,x='transaction_status', bins = 20) -data.groupby(by=['transaction_error_code', 'transaction_status'], as_index=False, dropna=False).size() -``` - -### Unique Error Codes Status per Transaction Hash - -An error code should only be associated with transaction hashes with a failed payment status. - -```{python} -#| label: nunique-errorcodes-per-trans -nunique_errorcodes_per_trans = data.groupby(by=['transaction_hash']).agg({'transaction_error_code':'nunique'}) -sns.histplot(data=nunique_errorcodes_per_trans,x='transaction_error_code', bins = 20) -data.groupby(by=['transaction_error_code', 'transaction_status'], as_index=False, dropna=False).size() -``` - -### UIDs with High Device Hash Counts - -```{python} -#| label: uid-maxdevice-trans-error-counts -nunique_devices_per_uid = data.groupby(by='uid', as_index=False).agg({'device_hash':'nunique'}).sort_values(by='device_hash') -uids_max_devices = data.loc[data['uid'].isin(nunique_devices_per_uid['uid'].tail()), :].sort_values(by=['uid', 'device_hash', 'transaction_date']) - -uids_max_devices.groupby(by=['userid'], as_index=False).agg({'device_hash':'nunique', 'transaction_hash':'count', 'transaction_error_code':'count'}) - -uids_max_devices.groupby(by=['transaction_error_code'], as_index=False).size().sort_values(by='size', ascending=False) -``` - -### UIDs with High Card Hash Counts - -```{python} -#| label: uid-maxcard-trans-error-counts -nunique_cards_per_uid = data.groupby(by='uid', as_index=False).agg({'card_hash':'nunique'}).sort_values(by='card_hash') -uids_max_cards = data.loc[data['uid'].isin(nunique_devices_per_uid['uid'].tail()), :].sort_values(by=['uid', 'card_hash', 'transaction_date']) - -uids_max_cards.groupby(by=['userid'], as_index=False).agg({'card_hash':'nunique', 'transaction_hash':'count', 'transaction_error_code':'count'}) - -uids_max_cards.groupby(by=['transaction_error_code'], as_index=False).size().sort_values(by='size', ascending=False) -``` - - -### UIDs with IP Hash Counts - -```{python} -#| label: uid-maxip-trans-error-counts -nunique_ips_per_uid = data.groupby(by='uid', as_index=False).agg({'ip_hash':'nunique'}).sort_values(by='ip_hash') -uids_max_ips = data.loc[data['uid'].isin(nunique_ips_per_uid['uid'].tail()), :].sort_values(by=['uid', 'ip_hash', 'transaction_date']) - -uids_max_ips.groupby(by=['userid'], as_index=False).agg({'ip_hash':'nunique', 'transaction_hash':'count', 'transaction_error_code':'count'}) - -uids_max_ips.groupby(by=['transaction_error_code'], as_index=False).size().sort_values(by='size', ascending=False) -``` - -## Card - -### Unique Card Types per Card Hashes - -Each card should have a single card type associated with it. 
-
-```{python}
-#| label: nunique-cardtypes-per-card
-nunique_cardtypes_per_card = data.groupby(by=['card_hash']).agg({'card_type':'nunique'})
-sns.histplot(data=nunique_cardtypes_per_card,x='card_type', bins = 20)
-data.groupby(by=['card_type', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_hash':'size', 'transaction_amount':'sum'})
-```
-
-### Unique Country Code per Card Hashes
-
-Each card should have a single country code associated with it.
-
-```{python}
-#| label: nunique-countrycodes-per-card
-nunique_countrycodes_per_card = data.groupby(by=['card_hash']).agg({'card_country_code':'nunique'})
-sns.histplot(data=nunique_countrycodes_per_card,x='card_country_code', bins = 20)
-#data.groupby(by=['card_country_code'], as_index=False, dropna=False).size()
-data.assign(card_country_code=data['card_country_code'].notnull().astype(int)).groupby(by=['card_country_code', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_hash':'size', 'transaction_amount':'sum'})
-```
-
-## IP
-
-### Unique Country Codes per IP Hashes
-
-Each IP should have a single country code associated with it.
-
-```{python}
-#| label: nunique-countrycodes-per-ip
-nunique_countrycodes_per_ip = data.groupby(by=['ip_hash']).agg({'ip_country_code':'nunique'})
-sns.histplot(data=nunique_countrycodes_per_ip,x='ip_country_code', bins = 20)
-data.groupby(by=['ip_country_code'], as_index=False, dropna=False).size()
-```
diff --git a/report/qmarkdown/.gitignore b/report/qmarkdown/.gitignore
new file mode 100644
index 0000000..3476f27
--- /dev/null
+++ b/report/qmarkdown/.gitignore
@@ -0,0 +1,2 @@
+/.quarto/
+/docs/
\ No newline at end of file
diff --git a/report/qmarkdown/_quarto.yml b/report/qmarkdown/_quarto.yml
new file mode 100644
index 0000000..7f0c914
--- /dev/null
+++ b/report/qmarkdown/_quarto.yml
@@ -0,0 +1,3 @@
+project:
+  type: default
+  output-dir: docs
\ No newline at end of file
diff --git a/report/qmarkdown/customer_value_score.qmd b/report/qmarkdown/customer_value_score.qmd
new file mode 100644
index 0000000..b10dfcb
--- /dev/null
+++ b/report/qmarkdown/customer_value_score.qmd
@@ -0,0 +1,93 @@
+---
+title: "Customer Value Score"
+format:
+  html:
+    code-fold: true
+jupyter: python3
+---
+
+```{python}
+#| label: set-up
+import os
+import numpy as np
+import pandas as pd
+import sys
+
+sys.path.append(os.getcwd())
+sys.path.append(os.path.dirname(os.getcwd()))
+sys.path.append(os.path.dirname(os.path.dirname(os.path.join(os.getcwd()))))
+
+import generator.cons as cons
+from report.qmarkdown.utilities.value_score import week_pct_score, gen_weekly_user_scores, apply_cum_sum
+
+data_dir = os.path.join('..', '..', 'data')
+data_report_dir = os.path.join(data_dir, 'report')
+```
+
+# Data Load
+
+Load the engineered feature data.
+
+```{python}
+#| label: data-load
+
+# load user feature data
+user_feat_fpath = os.path.join(data_report_dir, 'user_feat_data.csv')
+feat_data = pd.read_csv(user_feat_fpath)
+feat_data.head()
+```
+
+# Weekly Customer Value Score
+
+Percentile-rank the users within each week number based on their successful transaction counts and amount totals. The goal is to score the top purchasing users in each week.
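Before the report's own `weekly-score-data` cell (next), here is a minimal, self-contained sketch of the percentile-ranking idea, mirroring `week_pct_score` from `report/qmarkdown/utilities/value_score.py` introduced later in this diff; the toy `feat` frame and its values are invented purely for illustration.

```python
import pandas as pd

# toy weekly feature rows: one row per (userid, transaction_week) pair
feat = pd.DataFrame({
    "userid": ["u1", "u2", "u3", "u1", "u2", "u3"],
    "transaction_week": [1, 1, 1, 2, 2, 2],
    "Successful_size": [3, 10, 1, 4, 2, 8],
    "Successful_sum": [30.0, 120.0, 5.0, 44.0, 19.0, 90.0],
})

# percentile rank each user against the other users in the same week
score_cols = ["Successful_size", "Successful_sum"]
pct = feat.groupby("transaction_week", group_keys=False)[score_cols].apply(
    lambda g: g.rank(method="average", ascending=True, pct=True)
)
pct.columns = pct.columns + "_pct"
print(feat.join(pct))
```

A rank of 1.0 marks the week's top purchaser on that feature, which keeps the scores comparable across weeks with different numbers of active users.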
+
+```{python}
+#| label: weekly-score-data
+
+# only consider users who made at least one transaction in any given week
+score_data_week = feat_data.loc[feat_data['Successful_size'] > 0, :].copy()
+# percentile score each user within each week on their successful transaction counts and amounts
+score_data_week = score_data_week.join(score_data_week.groupby(by=['transaction_week'], group_keys=False)[['Successful_size','Successful_sum']].apply(lambda group: week_pct_score(group)))
+# group by each user and average the percentile scores to determine their weekly value scores
+score_data_week = score_data_week.join(score_data_week.groupby(by=['userid'], group_keys=False)[['Successful_size_pct', 'Successful_sum_pct']].apply(lambda group: gen_weekly_user_scores(group)))
+score_data_week.head(10)
+```
+
+# Global Customer Value Score
+
+Calculate the global customer value score, which is the cumulative sum of all weekly customer value scores.
+
+```{python}
+#| label: global-value-score
+
+# apply a cumulative sum to track each value score's increase over time
+score_data_week = score_data_week.join(score_data_week.groupby(by=['userid'], group_keys=False)[['transaction_week', 'customer_value_score']].apply(lambda group: apply_cum_sum(group)))
+score_data_week.head(10)
+```
+
+# Create Base Value Score Dataset
+
+Generate a base dataset which spreads all users across all known weeks with their associated weekly engineered features at that time.
+
+```{python}
+#| label: base-data
+
+# create base data
+base_score_data = pd.merge(left=feat_data, right=score_data_week, on=['userid','transaction_week'], how='left')
+# fill missing pct and score values as 0
+base_score_data = base_score_data.fillna(0)
+# subset out required columns
+sub_cols = ["userid", "transaction_week", "Successful_size_pct", "Successful_sum_pct", "customer_value_score", "customer_value_score_cum_sum"]
+base_score_data = base_score_data[sub_cols]
+base_score_data.head(10)
+```
+
+# Write Data to Disk
+
+```{python}
+#| label: write-data
+
+# write customer value score to disk
+user_score_data_fpath=os.path.join(data_report_dir, 'customer_value_score.csv')
+base_score_data.to_csv(user_score_data_fpath, index=False)
+```
diff --git a/report/qmarkdown/entity_count_dists.qmd b/report/qmarkdown/entity_count_dists.qmd
new file mode 100644
index 0000000..c9f8e32
--- /dev/null
+++ b/report/qmarkdown/entity_count_dists.qmd
@@ -0,0 +1,84 @@
+---
+title: "Poisson Power Entity Count Distributions"
+format:
+  html:
+    code-fold: true
+jupyter: python3
+---
+
+# Poisson Power Entity Count Distributions
+
+```{python}
+#| label: set-up
+import sys
+import os
+
+sys.path.append(os.getcwd())
+sys.path.append(os.path.dirname(os.getcwd()))
+sys.path.append(os.path.dirname(os.path.dirname(os.path.join(os.getcwd()))))
+
+import generator.cons as cons
+from generator.utilities.gen_random_poisson_power import gen_random_poisson_power
+from report.qmarkdown.utilities.plotting import distribution_plot
+
+# set the number of users to generate random entity counts for
+n_users = 10000
+```
+
+This report illustrates the underlying Poisson Power Distributions used to generate the random entity counts for each user object group: devices, cards, IPs, applications and transactions. The goal is to replicate the real-life count distributions for these user-based entity counts.
+
+# Device
+
+The count of used devices per userid.
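`gen_random_poisson_power` itself is not shown in this diff, so the stand-in below is a guess at its shape rather than the generator's actual implementation: it draws Poisson counts and raises them to a power to fatten the tail. The function name `poisson_power_sample` and the floor at one entity per user are illustrative assumptions only.

```python
import numpy as np

def poisson_power_sample(lam, size, power, rng=None):
    # hypothetical stand-in for generator.utilities.gen_random_poisson_power:
    # draw Poisson counts, then apply a power transform to fatten the tail
    rng = np.random.default_rng() if rng is None else rng
    counts = rng.poisson(lam=lam, size=size) ** power
    # assume every user owns at least one of each entity (illustrative floor)
    return np.maximum(counts.round().astype(int), 1)

sample = poisson_power_sample(lam=1.0, size=10_000, power=1.3)
print(sample.min(), sample.max(), round(sample.mean(), 3))
```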
+ +```{python} +#| label: device-count-per-userid +device_poisson_params = cons.data_model_poisson_params['device'] +device_cnts_per_user = gen_random_poisson_power(lam=device_poisson_params['lambda'], size=n_users, power=device_poisson_params['power']) +distribution_plot(data=device_cnts_per_user, title="Random Poisson Power Distribution for Devices") + +``` + +# Card + +The count of used cards per userid. + +```{python} +#| label: card-count-per-userid +card_poisson_params = cons.data_model_poisson_params['card'] +card_cnts_per_user = gen_random_poisson_power(lam=card_poisson_params['lambda'], size=n_users, power=card_poisson_params['power']) +distribution_plot(data=card_cnts_per_user, title="Random Poisson Power Distribution for Cards") +``` + +# IP + +The count of used IPs per userid. + +```{python} +#| label: ip-count-per-userid +ip_poisson_params = cons.data_model_poisson_params['ip'] +ip_cnts_per_user = gen_random_poisson_power(lam=ip_poisson_params['lambda'], size=n_users, power=ip_poisson_params['power']) +distribution_plot(data=ip_cnts_per_user, title="Random Poisson Power Distribution for Ips") +``` + +# Application + +The count of applications per userid. + +```{python} +#| label: application-count-per-userid +app_poisson_params = cons.data_model_poisson_params['application'] +app_cnts_per_user = gen_random_poisson_power(lam=app_poisson_params['lambda'], size=n_users, power=app_poisson_params['power']) +distribution_plot(data=app_cnts_per_user, title="Random Poisson Power Distribution for Applications") +``` + +# Transaction + +The count of transactions per userid. + +```{python} +#| label: transaction-count-per-userid +trans_poisson_params = cons.data_model_poisson_params['transaction'] +trans_cnts_per_user = gen_random_poisson_power(lam=trans_poisson_params['lambda'], size=n_users, power=trans_poisson_params['power']) +distribution_plot(data=trans_cnts_per_user, title="Random Poisson Power Distribution for Transactions") +``` \ No newline at end of file diff --git a/report/feature_engineering.qmd b/report/qmarkdown/feature_engineering.qmd similarity index 60% rename from report/feature_engineering.qmd rename to report/qmarkdown/feature_engineering.qmd index 2c12923..26a5827 100644 --- a/report/feature_engineering.qmd +++ b/report/qmarkdown/feature_engineering.qmd @@ -1,3 +1,11 @@ +--- +title: "Feature Engineering" +format: + html: + code-fold: true +jupyter: python3 +--- + ```{python} #| label: set-up import os @@ -12,52 +20,41 @@ from sklearn.ensemble import IsolationForest sys.path.append(os.getcwd()) sys.path.append(os.path.dirname(os.getcwd())) +sys.path.append(os.path.dirname(os.path.dirname(os.path.join(os.getcwd())))) import generator.cons as cons +from report.qmarkdown.utilities.feature_engineer import feature_engineer, merge_features + +data_dir = os.path.join('..', '..', 'data') +data_arch_dir = os.path.join(data_dir, 'arch') +data_report_dir = os.path.join(data_dir, 'report') ``` # Load Random Telecom Payments Data ```{python} #| label: data-load + # load random telecom payments data -data_fpath=os.path.join('..', 'data', 'arch', 'RandomTelecomPaymentsV1.1.csv') parse_dates = ['registration_date', 'transaction_date'] date_parser = lambda x: datetime.strptime(x, '%Y-%m-%d') -data = pd.read_csv(filepath_or_buffer=data_fpath, parse_dates=parse_dates) - -# determine the week number for all transaction dates -data['transaction_week'] = data['transaction_date'].dt.isocalendar().week +data = pd.read_csv(filepath_or_buffer=cons.fpath_arch_randomtelecomdata, 
parse_dates=parse_dates)
```

# Engineer User Features per Week

+For each transaction date week number, a variety of sum and count features are engineered across the different levels and entities within the dataset:
+- Total user transaction counts and amounts by error code
+- Total user transaction counts and amounts by status code
+- Total user device counts
+- Total user card counts
+- Total user IP counts
+
```{python}
#| label: feature-engineering
-def feature_engineer(data, ids, groups, target, func):
- """
- """
- # aggregate across the ids and group, applying the function to the target
- data_agg = data.copy().groupby(by=ids+groups, as_index=False).agg({target:func})
- # pivot the target results across each group
- data_pivot = pd.pivot_table(data=data_agg, index=ids, values=target, columns=groups)
- # rename and format the columns
- data_pivot.columns = data_pivot.columns.str.split(':').str[0] + f'_{func}'
- data_pivot = data_pivot.reset_index()
- data_pivot.columns.name = None
- return data_pivot
-
-
-def merge_features(feat_objs):
- """
- """
- feat_data = pd.DataFrame(columns=['userid', 'transaction_week'])
- # join objects
- for feat_obj in feat_objs:
- feat_data = pd.merge(left=feat_data, right=feat_obj, how='outer', on=['userid', 'transaction_week'])
- # fill for missing values
- feat_data = feat_data.fillna(0)
- return feat_data
+
+# determine the week number for all transaction dates
+data['transaction_week'] = data['transaction_date'].dt.isocalendar().week

# user trans error counts and sums
userid_error_cnt_data = feature_engineer(data=data, ids=['userid', 'transaction_week'], groups=['transaction_error_code'], target='transaction_amount', func='size')
@@ -73,18 +70,21 @@ userid_card_cnt_data = feature_engineer(data=data, ids=['userid', 'transaction_w
userid_ip_cnt_data = feature_engineer(data=data, ids=['userid', 'transaction_week'], groups=[], target='ip_hash', func='size')

# join all user feature datasets together
-feat_objs = [userid_device_cnt_data, userid_card_cnt_data, userid_ip_cnt_data, userid_status_cnt_data, userid_status_sum_data, userid_error_cnt_data, userid_error_sum_data]
-feat_data = merge_features(feat_objs)
+feat_objects = [userid_device_cnt_data, userid_card_cnt_data, userid_ip_cnt_data, userid_status_cnt_data, userid_status_sum_data, userid_error_cnt_data, userid_error_sum_data]
+feat_data = merge_features(feat_objects)
```

# Create the Base User Feature Dataset

+Generate a base dataset which spreads all users across all known weeks with their associated weekly engineered features at that time.
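The base-data cell that follows builds this user-by-week grid with a constant-key cross join; in isolation, with toy frames, the pattern looks like this:

```python
import pandas as pd

users = pd.DataFrame({"userid": ["u1", "u2"]})
weeks = pd.DataFrame({"transaction_week": [1, 2, 3]})

# cross join users x weeks via a constant key so every user
# gets a row for every observed week, even weeks with no activity
base = (
    users.assign(key=1)
    .merge(weeks.assign(key=1), on="key", how="inner")
    .drop(columns="key")
)
print(base)  # 2 users x 3 weeks = 6 rows
```

On pandas 1.2+ the same grid can be built directly with `pd.merge(users, weeks, how='cross')`; the constant-key variant shown here matches the report's code.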
+
```{python}
#| label: base-data
+
# create base data
base_user_data = data[['userid']].drop_duplicates().reset_index(drop=True).assign(key = 1).sort_values(by='userid')
-base_transweek_data = data[['transaction_week']].drop_duplicates().reset_index(drop=True).assign(key = 1).sort_values(by='transaction_week')
-base_data = pd.merge(left=base_user_data, right=base_transweek_data, on='key', how='inner').drop(columns=['key'])
+base_trans_week_data = data[['transaction_week']].drop_duplicates().reset_index(drop=True).assign(key = 1).sort_values(by='transaction_week')
+base_data = pd.merge(left=base_user_data, right=base_trans_week_data, on='key', how='inner').drop(columns=['key'])
base_feat_data = pd.merge(left=base_data, right=feat_data, on=['userid','transaction_week'], how='left')
# fill 0 for any weeks missing user feature data
base_feat_data = base_feat_data.fillna(0)
@@ -94,6 +94,8 @@ base_feat_data = base_feat_data.fillna(0)
```{python}
#| label: write data
-feat_data_fpath=os.path.join('..', 'data', 'report', 'user_feat_data.csv')
+
+# write feature data to disk
+feat_data_fpath=os.path.join(data_report_dir, 'user_feat_data.csv')
base_feat_data.to_csv(feat_data_fpath, index=False)
```
\ No newline at end of file
diff --git a/report/qmarkdown/isolation_forests.qmd b/report/qmarkdown/isolation_forests.qmd
new file mode 100644
index 0000000..b9a67f2
--- /dev/null
+++ b/report/qmarkdown/isolation_forests.qmd
@@ -0,0 +1,115 @@
+---
+title: "Isolation Forests"
+format:
+  html:
+    code-fold: true
+jupyter: python3
+---
+
+```{python}
+#| label: set-up
+import os
+import numpy as np
+import pandas as pd
+import sklearn as sk
+import shap
+import sys
+from datetime import datetime
+
+sys.path.append(os.getcwd())
+sys.path.append(os.path.dirname(os.getcwd()))
+sys.path.append(os.path.dirname(os.path.dirname(os.path.join(os.getcwd()))))
+
+import generator.cons as cons
+from report.qmarkdown.utilities.anomaly_score import IsolationForestsModel, gen_anomaly_score, gen_cumulative_anomaly_score
+
+data_dir = os.path.join('..', '..', 'data')
+data_report_dir = os.path.join(data_dir, 'report')
+```
+
+# Data Load
+
+Load in the engineered feature data and connected user component data for identifying abnormal user behaviour.
+
+```{python}
+#| label: data-load
+
+# load user feature data
+user_feat_fpath = os.path.join(data_report_dir, 'user_feat_data.csv')
+feat_data = pd.read_csv(user_feat_fpath)
+
+# load network data
+comp_data_fpath=os.path.join(data_report_dir, 'user_comp_data.csv')
+comp_data = pd.read_csv(comp_data_fpath)
+
+# join feature and component data
+join_cols = ['userid', 'transaction_week']
+model_data = pd.merge(left=feat_data, right=comp_data, on=join_cols, how='left').fillna(0)
+# order model data by transaction week and user ids
+order_cols = ['transaction_week', 'userid']
+model_data = model_data.sort_values(by=order_cols).reset_index(drop=True)
+```
+
+# Isolation Forests Model
+
+Isolation Forests is an anomaly detection algorithm which applies recursive partitioning to isolate abnormal values within a multi-dimensional space.
+
+```{python}
+#| label: isolation-forests-model
+
+# initiate isolation forest model
+model = IsolationForestsModel(n_estimators=5, random_state=None, n_jobs=2, warm_start=True)
+
+# write model to disk
+model_fpath = os.path.join('..', '..', 'data', 'report', 'isolation_forests_model.pickle')
+model.write(model_fpath)
+```
+
+# Score Data
+
+Score the user behaviour data using the Isolation Forests model.
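The weekly scoring that follows leans on scikit-learn's `warm_start` flag, which `IsolationForestsModel` (defined later in this diff in `report/qmarkdown/utilities/anomaly_score.py`) wraps: each weekly fit grows the ensemble instead of retraining it from scratch. A minimal sketch on synthetic data, with invented arrays standing in for the weekly feature groups:

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(42)
X_week1 = rng.normal(size=(200, 3))  # stand-in for one week's features
X_week2 = rng.normal(size=(200, 3))

# warm-start forest: bumping n_estimators before each fit adds new
# trees trained on the new week's data while keeping the old trees
model = IsolationForest(n_estimators=5, warm_start=True, random_state=0)
model.fit(X_week1)
model.n_estimators += 5
model.fit(X_week2)

# decision_function returns higher values for normal points;
# negative values flag the more anomalous rows
print(model.decision_function(X_week2)[:5])
```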
+
+```{python}
+#| label: score-data
+
+# apply isolation forests model across each transaction week
+score_data = model_data.groupby(by=['transaction_week'], group_keys=False, as_index=False).apply(lambda group: gen_anomaly_score(group, model_fpath=model_fpath))
+# generate cumulative anomaly score per user
+anomaly_data = score_data.groupby(by=['userid'], group_keys=False, as_index=False).apply(lambda group: gen_cumulative_anomaly_score(group))
+# sort data by user id and transaction week
+sort_cols = ['userid', 'transaction_week']
+anomaly_data = anomaly_data.sort_values(by=sort_cols).reset_index(drop=True)
+anomaly_data['black_list_users'] = (anomaly_data['anomaly_score'] < 0).astype(int)
+```
+
+# Write Anomalous Data
+
+Write the anomalous data to disk.
+
+```{python}
+#| label: write-data
+# save user anomaly data to disk
+score_data_users_fpath=os.path.join('..', '..', 'data', 'report', 'user_anomaly_data.csv')
+anomaly_data.to_csv(score_data_users_fpath, index=False)
+```
+
+# Evaluate Model
+
+Perform a SHAP analysis to understand how the Isolation Forests model isolates and identifies abnormal values.
+
+```{python}
+# initiate and load isolation forest model
+model = IsolationForestsModel()
+model_fpath = os.path.join('..', '..', 'data', 'report', 'isolation_forests_model.pickle')
+model = model.read(model_fpath)
+# load scored data
+score_data_users_fpath=os.path.join('..', '..', 'data', 'report', 'user_anomaly_data.csv')
+anomaly_data = pd.read_csv(score_data_users_fpath)
+eval_data = anomaly_data.loc[anomaly_data['transaction_week'] == 52, :].copy()
+# generate shap bee swarm plot
+pred = (eval_data['anomaly_score'] < 0).astype(int)
+explainer = shap.TreeExplainer(model.model)
+X_cols = ['E901_size', 'E901_sum', 'E902_size', 'E902_sum', 'n_comps', 'total_comp_size']
+explanation = explainer(eval_data[X_cols])
+shap.plots.beeswarm(explanation)
+```
diff --git a/report/qmarkdown/network.qmd b/report/qmarkdown/network.qmd
new file mode 100644
index 0000000..179193e
--- /dev/null
+++ b/report/qmarkdown/network.qmd
@@ -0,0 +1,126 @@
+---
+title: "Networks"
+format:
+  html:
+    code-fold: true
+jupyter: python3
+---
+
+```{python}
+#| label: set-up
+import os
+import numpy as np
+import pandas as pd
+import sklearn as sk
+import networkx as nx
+import matplotlib.pyplot as plt
+import shap
+import sys
+from datetime import datetime
+from sklearn.ensemble import IsolationForest
+
+sys.path.append(os.getcwd())
+sys.path.append(os.path.dirname(os.getcwd()))
+sys.path.append(os.path.dirname(os.path.dirname(os.path.join(os.getcwd()))))
+
+import generator.cons as cons
+from report.qmarkdown.utilities.networks import gen_entity_network_data, gen_base_network_data, gen_comp_data
+
+data_dir = os.path.join('..', '..', 'data')
+data_arch_dir = os.path.join(data_dir, 'arch')
+data_report_dir = os.path.join(data_dir, 'report')
+```
+
+# Load Random Telecom Payments Data
+
+```{python}
+#| label: data-load
+# load random telecom payments data
+parse_dates = ['registration_date', 'transaction_date']
+date_parser = lambda x: datetime.strptime(x, '%Y-%m-%d')
+data = pd.read_csv(filepath_or_buffer=cons.fpath_arch_randomtelecomdata, parse_dates=parse_dates)
+
+# determine the week number for all transaction dates
+data['transaction_week'] = data['transaction_date'].dt.isocalendar().week
+```
+
+# Data Preparation
+
+Generate separate networks connecting each entity type (devices, IPs and cards) to its corresponding users, then combine the networks on the linked users, attributing the devices, IPs and/or cards they share.
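The `data-preparation` cell that follows relies on `gen_entity_network_data` (defined later in this diff in `report/qmarkdown/utilities/networks.py`), whose core move is a self-merge on the shared entity hash. With a toy transaction frame:

```python
import pandas as pd

trans = pd.DataFrame({
    "userid": ["u1", "u2", "u3", "u4"],
    "device_hash": ["d1", "d1", "d2", "d2"],
    "transaction_week": [1, 1, 1, 2],
})

# self-merge on the shared entity and week to pair up users,
# then drop the self-loops where a user matches themselves
pairs = trans.merge(trans, on=["device_hash", "transaction_week"])
edges = pairs.loc[pairs["userid_x"] != pairs["userid_y"],
                  ["userid_x", "userid_y", "device_hash", "transaction_week"]]
# u1<->u2 share d1 in week 1; u3 and u4 used d2 in different weeks, so no edge
print(edges)
```

Note each linked pair appears in both orientations (`u1, u2` and `u2, u1`), which is harmless here since the edge list is later fed into an undirected graph.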
+
+```{python}
+#| label: data-preparation
+
+# generate shared user entity networks
+user_device_data, user_device_network_data = gen_entity_network_data(data=data, entity='device_hash', userid = 'userid')
+user_ip_data, user_ip_network_data = gen_entity_network_data(data=data, entity='ip_hash', userid = 'userid')
+user_card_data, user_card_network_data = gen_entity_network_data(data=data, entity='card_hash', userid = 'userid')
+# generate base entity network data
+entity_networks = [user_device_network_data, user_ip_network_data, user_card_network_data]
+base_data = gen_base_network_data(entity_networks=entity_networks, user_cols=['userid_x','userid_y'])
+```
+
+# Network Analysis
+
+Identify the connected components of user groups who share common devices, IPs and/or cards.
+
+```{python}
+#| label: network-analysis
+
+# generate components for all entities
+user_device_comp_data = gen_comp_data(network_data=user_device_network_data, entity_data = user_device_data, edge_attr='device_hash')
+user_ip_comp_data = gen_comp_data(network_data=user_ip_network_data, entity_data = user_ip_data, edge_attr='ip_hash')
+user_card_comp_data = gen_comp_data(network_data=user_card_network_data, entity_data = user_card_data, edge_attr='card_hash')
+# concatenate component data together
+concat_objects = [user_device_comp_data, user_ip_comp_data, user_card_comp_data]
+user_entity_comp_data = pd.concat(objs=concat_objects, axis=0, ignore_index=True)
+```
+
+# Graph Connected Component
+
+```{python}
+#| label: connected-component
+type_filter = user_entity_comp_data['type'].isin(['device_hash', 'card_hash', 'ip_hash'])
+trans_week_filter = user_entity_comp_data['transaction_week'].isin([52])
+comp_data = user_entity_comp_data.loc[type_filter & trans_week_filter, :].copy()
+
+# generate graph from component data
+comp_ids = comp_data['comp_id'].unique()
+comp_data = comp_data.loc[comp_data['comp_id'].isin(comp_ids), :]
+G = nx.from_pandas_edgelist(df = comp_data, source = 'userid', target = 'idhashes', edge_attr = ['comp_id', 'comp_size', 'type'])
+
+# define node colours
+idhash_colours = comp_data.drop_duplicates(subset=['idhashes', 'type']).set_index('idhashes')['type'].replace({'device_hash':'orange', 'ip_hash':'yellow', 'card_hash':'red'})
+userid_colours = comp_data.drop_duplicates(subset=['userid']).set_index('userid').assign(colour='blue')['colour']
+colours_df = pd.concat(objs = [userid_colours, idhash_colours], axis=0).rename('colours').to_frame().reset_index().rename(columns={'index':'nodes'})
+# join colour to nodes
+node_colours_df = pd.Series(list(G), name='nodes').to_frame().merge(colours_df, on='nodes', how='left')
+
+# plot network
+fig, ax = plt.subplots()
+nx.draw_networkx(G, pos = nx.spring_layout(G), with_labels = True, node_size = 30, font_size = 1, node_color=node_colours_df['colours'].to_list(), ax=ax)
+ax.set(title="Connected Component")
+plt.show()
+```
+
+# Feature Data
+
+```{python}
+# generate user network feature data
+groupby_cols = ['userid', 'transaction_week']
+agg_dict = {'type':'nunique', 'comp_size':'sum'}
+rename_dict = {'type':'n_comps', 'comp_size':'total_comp_size'}
+feat_data = user_entity_comp_data.groupby(by=groupby_cols, as_index=False).agg(agg_dict).rename(columns=rename_dict)
+# sort by transaction week and user id
+feat_data = feat_data.sort_values(by=['transaction_week', 'userid']).reset_index(drop=True)
+```
+
+# Write Data
+
+Write the user component data to disk.
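Before the component features are written out, it may help to see the component extraction from the cells above in miniature; this mirrors the `networkx` calls inside `gen_comp_data` (defined later in this diff), with an invented edge list:

```python
import pandas as pd
import networkx as nx

edges = pd.DataFrame({
    "userid_x": ["u1", "u2", "u4"],
    "userid_y": ["u2", "u3", "u5"],
    "device_hash": ["d1", "d1", "d9"],
})

# build an undirected graph from the user pairs and enumerate its
# connected components: each component is one linked user group
G = nx.from_pandas_edgelist(edges, source="userid_x", target="userid_y",
                            edge_attr=["device_hash"])
for comp_id, members in enumerate(nx.connected_components(G)):
    print(comp_id, sorted(members))  # 0 ['u1', 'u2', 'u3'] / 1 ['u4', 'u5']
```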
+
+```{python}
+#| label: write-data
+feat_data_fpath=os.path.join(data_report_dir, 'user_comp_data.csv')
+feat_data.to_csv(feat_data_fpath, index=False)
+```
\ No newline at end of file
diff --git a/report/qmarkdown/qa.qmd b/report/qmarkdown/qa.qmd
new file mode 100644
index 0000000..74e88cd
--- /dev/null
+++ b/report/qmarkdown/qa.qmd
@@ -0,0 +1,256 @@
+---
+title: "QA Report"
+format:
+  html:
+    code-fold: true
+jupyter: python3
+---
+
+# Random Telecom Payments QA Report
+
+```{python}
+#| label: data-load
+
+import logging
+import os
+import sys
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from datetime import datetime
+
+# set up logging
+lgr = logging.getLogger()
+lgr.setLevel(logging.INFO)
+
+# set file paths for custom python modules
+root_dir = os.path.dirname(os.path.dirname(os.path.join(os.getcwd())))
+sys.path.append(root_dir)
+sys.path.append(os.path.join(root_dir, 'generator'))
+
+import cons
+import generator.qa as qa
+
+# load data
+pd.set_option('display.max_columns', None)
+parse_dates = ['registration_date', 'transaction_date']
+date_parser = lambda x: datetime.strptime(x, '%Y-%m-%d')
+data = pd.read_csv(cons.fpath_randomtelecomtransdata, parse_dates=parse_dates)
+userdata = pd.read_parquet(cons.fpath_randomtelecomusersdata)
+
+# show summary of data
+data.info()
+
+# initialise QA objects
+qaUids = qa.Uids(data=data, show_logs=True, show_plots=True)
+qaTransactions = qa.Transactions(data=data, show_logs=True, show_plots=True)
+qaCards = qa.Cards(data=data, show_logs=True, show_plots=True)
+qaIps = qa.Ips(data=data, show_logs=True, show_plots=True)
+```
+
+## Users
+
+Check that the data makes sense and there are no anomalies at a user level.
+
+### Unique UserIds per UID
+
+There should be exactly one unique userid for every UID.
+
+```{python}
+#| label: nunique-user-ids-per-uid
+qaUids.unique_user_ids()
+```
+
+### Unique Full Names per UID
+
+There should be exactly one unique fullname for every UID.
+
+```{python}
+#| label: nunique-names-per-uid
+qaUids.unique_names()
+```
+
+### Unique Registration Dates per UID
+
+A user should register only on a single date.
+
+```{python}
+#| label: nunique-reg-dates-per-uid
+qaUids.unique_reg_dates()
+```
+
+### Unique Registration Countries per UID
+
+When registering, the user should set their country code of residence.
+
+```{python}
+#| label: nunique-reg-countries-per-uid
+qaUids.nunique_reg_countries()
+```
+
+### Unique Email Domains per UID
+
+A user should register with a single email address corresponding to a single email domain.
+
+```{python}
+#| label: nunique-email-domains-per-uid
+qaUids.unique_email_domains()
+```
+
+### Unique Device Hash per UID
+
+A UID should have 1 to 3 devices.
+
+```{python}
+#| label: nunique-devices-per-uid
+qaUids.unique_devices()
+```
+
+### Unique Card Hash per UID
+
+A UID should have 1 to 2 cards, with an overall distribution lower than the corresponding device hash distribution.
+
+```{python}
+#| label: nunique-cards-per-uid
+qaUids.unique_cards()
+```
+
+### Unique IP Hash per UID
+
+A UID should have between 1 and 10 IPs.
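The `unique_ips` check that follows wraps the same groupby-nunique pattern the rest of the QA suite uses; the exact assertion inside `qa.Uids` is not shown in this diff, but stripped down to a toy frame the 1-to-10 rule stated above reads as:

```python
import pandas as pd

trans = pd.DataFrame({
    "uid": ["a", "a", "b", "b", "b"],
    "ip_hash": ["ip1", "ip2", "ip3", "ip3", "ip4"],
})

# count distinct IPs per UID, then assert every count sits in range
nunique_ips_per_uid = trans.groupby("uid").agg({"ip_hash": "nunique"})
assert nunique_ips_per_uid["ip_hash"].between(1, 10).all()
print(nunique_ips_per_uid)  # a: 2, b: 2
```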
+
+```{python}
+#| label: nunique-ips-per-uid
+qaUids.unique_ips()
+```
+
+### Unique Application Hash per UID
+
+```{python}
+#| label: nunique-apps-per-uid
+qaUids.unique_apps()
+```
+
+### Unique Transaction Hash per UID
+
+```{python}
+#| label: nunique-transactions-per-uid
+qaUids.unique_transactions()
+```
+
+## Transaction
+
+### Unique Transaction Hash
+
+```{python}
+#| label: nunique-trans-hash
+qaTransactions.unique_trans_hash()
+```
+
+### Unique Date per Transaction Hash
+
+Each transaction hash should have a single date associated with it.
+
+```{python}
+#| label: nunique-dates-per-trans
+qaTransactions.unique_dates()
+```
+
+### Unique Amount per Transaction Hash
+
+Each transaction hash should have a single transaction amount associated with it.
+
+```{python}
+#| label: nunique-trans-amount-per-trans
+qaTransactions.unique_trans_amount()
+```
+
+### Unique Payment Method per Transaction Hash
+
+Each transaction hash should have a single transaction payment method associated with it. Note that in some circumstances the payment method is missing because the transaction amount was 0.
+
+```{python}
+#| label: nunique-payment-method-per-trans
+qaTransactions.unique_payment_method()
+```
+
+### Unique Payment Channel per Transaction Hash
+
+Each transaction hash should have a single transaction payment channel associated with it. Note that in some circumstances the payment channel is missing when the transaction amount is 0, or the payment method is wallet or points.
+
+```{python}
+#| label: nunique-payment-channel-per-trans
+qaTransactions.unique_payment_channel()
+```
+
+### Unique Transaction Status per Transaction Hash
+
+Each transaction hash should have a single payment status associated with it.
+
+```{python}
+#| label: nunique-trans-status-per-trans
+qaTransactions.unique_trans_status()
+```
+
+### Unique Error Codes per Transaction Hash
+
+An error code should only be associated with transaction hashes with a failed payment status.
+
+```{python}
+#| label: nunique-errorcodes-per-trans
+qaTransactions.unique_error_codes()
+```
+
+### UIDs with High Device Hash Counts
+
+```{python}
+#| label: uid-max-device-trans-error-counts
+qaTransactions.uid_max_device_trans_error_counts()
+```
+
+### UIDs with High Card Hash Counts
+
+```{python}
+#| label: uid-max-card-trans-error-counts
+qaTransactions.uid_max_card_trans_error_counts()
+```
+
+### UIDs with High IP Hash Counts
+
+```{python}
+#| label: uid-max-ip-trans-error-counts
+qaTransactions.uid_max_ip_trans_error_counts()
+```
+
+## Card
+
+### Unique Card Types per Card Hashes
+
+Each card should have a single card type associated with it.
+
+```{python}
+#| label: nunique-card-types-per-card
+qaCards.unique_card_types()
+```
+
+### Unique Country Code per Card Hashes
+
+Each card should have a single country code associated with it.
+
+```{python}
+#| label: nunique-country-codes-per-card
+qaCards.unique_country_codes()
+```
+
+## IP
+
+### Unique Country Codes per IP Hashes
+
+Each IP should have a single country code associated with it.
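Where the per-UID checks assert a range, the entity-attribute checks assert exactly one distinct value per key, so the distinct-count series must equal 1 at both ends of its range. A stripped-down version of the country code rule that `qaIps.unique_country_codes()` verifies below, on an invented frame:

```python
import pandas as pd

trans = pd.DataFrame({
    "ip_hash": ["ip1", "ip1", "ip2"],
    "ip_country_code": ["IE", "IE", "FR"],
})

# each IP must resolve to exactly one country code, so the per-IP
# distinct count has to be 1 at both its minimum and maximum
nunique_cc_per_ip = trans.groupby("ip_hash").agg({"ip_country_code": "nunique"})
assert nunique_cc_per_ip["ip_country_code"].max() == 1
assert nunique_cc_per_ip["ip_country_code"].min() == 1
print(nunique_cc_per_ip)
```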
+ +```{python} +#| label: nunique-country-codes-per-ip +qaIps.unique_country_codes() +``` diff --git a/report/risk_score.qmd b/report/qmarkdown/risk_score.qmd similarity index 57% rename from report/risk_score.qmd rename to report/qmarkdown/risk_score.qmd index d6d7b49..c451600 100644 --- a/report/risk_score.qmd +++ b/report/qmarkdown/risk_score.qmd @@ -1,3 +1,10 @@ +--- +title: "Risk Score" +format: + html: + code-fold: true +jupyter: python3 +--- ```{python} #| label: set-up @@ -15,8 +22,13 @@ from sklearn.ensemble import RandomForestClassifier sys.path.append(os.getcwd()) sys.path.append(os.path.dirname(os.getcwd())) +sys.path.append(os.path.dirname(os.path.dirname(os.path.join(os.getcwd())))) import generator.cons as cons +from report.qmarkdown.utilities.plotting import lift_plot + +data_dir = os.path.join('..', '..', 'data') +data_report_dir = os.path.join(data_dir, 'report') ``` # Data Prep @@ -26,19 +38,19 @@ import generator.cons as cons ```{python} #| label: data-load # load random telecom payments data -user_feat_foath = os.path.join('..', 'data', 'report', 'user_feat_data.csv') +user_feat_foath = os.path.join(data_report_dir, 'user_feat_data.csv') feat_data = pd.read_csv(user_feat_foath) # load component data -user_comp_data_fpath=os.path.join('..', 'data', 'report', 'user_comp_data.csv') +user_comp_data_fpath=os.path.join(data_report_dir, 'user_comp_data.csv') user_comp_data = pd.read_csv(user_comp_data_fpath) # load customer value score -customer_value_score_foath = os.path.join('..', 'data', 'report', 'customer_value_score.csv') +customer_value_score_foath = os.path.join(data_report_dir, 'customer_value_score.csv') customer_value_score = pd.read_csv(customer_value_score_foath) # load white list users -white_list_users_fpath=os.path.join('..', 'data', 'report', 'white_list_user_data.csv') +white_list_users_fpath=os.path.join(data_report_dir, 'white_list_user_data.csv') white_list_users = pd.read_csv(white_list_users_fpath) # load anomaly scored data -anomaly_data_users_fpath=os.path.join('..', 'data', 'report', 'user_anomaly_data.csv') +anomaly_data_users_fpath=os.path.join(data_report_dir, 'user_anomaly_data.csv') usecols =['userid', 'transaction_week', 'score', 'anomaly_score', 'black_list_users'] anomaly_data = pd.read_csv(anomaly_data_users_fpath, usecols=usecols) ``` @@ -79,7 +91,7 @@ test_data = model_data.loc[model_data['transaction_week'].isin(range(46, 53)), : #| label: score-data # train isolation forests id_cols = ['userid', 'transaction_week'] -X_cols = ['device_hash_size', 'card_hash_size', 'ip_hash_size', 'successful_size', 'E900_size', 'E901_size', 'E902_size', 'E903_size', 'E904_size', 'n_comps', 'total_comp_size', 'customer_value_score', 'score'] +X_cols = ['device_hash_size', 'card_hash_size', 'ip_hash_size', 'Successful_size', 'E900_size', 'E901_size', 'E902_size', 'E903_size', 'E904_size', 'n_comps', 'total_comp_size', 'customer_value_score', 'score'] y_col = 'black_list_users' X_data = train_data[X_cols] y_data = train_data[y_col] @@ -90,42 +102,9 @@ clf.fit(X_data, y_data) ## Evaluate ```{python} -def liftplot(col, target, data, order = 'default', cut = None): - """ - Plot a bar chart of the target variable against a given predictor - """ - # take deep cut of data for temporary storage - tmp_data = data.copy() - # calculate the avergae rate of taking out a loan - mean_y_yes = tmp_data[target].mean() - # if quantile cutting col - if cut != None: - tmp_data[col] = pd.cut(x = tmp_data[col], bins = cut) - # determine plot order based on bin height - if order 
== 'default': - plot_order = tmp_data.groupby(col)[target].mean().sort_values(ascending=False).index.values - else: - plot_order = order - # set figure size - plt.figure(figsize=(8, 6)) - # create bar plot - sns.barplot(data = tmp_data, x = col, y = target, estimator = np.mean, errorbar = None, color = 'royalblue', order = plot_order) - # format plot title, ticks and labels - plt.title(f'{col} vs {target}', size = 20) - plt.yticks(size = 15) - plt.xticks(rotation = 45, size = 15) - plt.xlabel(col, size = 18) - plt.ylabel(target, size = 18) - # red line indicates average rate of taking out a loan - plt.axhline(y=mean_y_yes, color = 'red', linestyle = '--', linewidth = 3) - # show and close plot - plt.show() - plt.close() - return 0 - # make valid and test predictions valid_data['predict_proba'] = clf.predict_proba(valid_data[X_cols])[:, 1] valid_data.sort_values(by=['predict_proba']) -liftplot(col='predict_proba', target='black_list_users', data=valid_data, order = None, cut = 10) +lift_plot(col='predict_proba', target='black_list_users', data=valid_data, order = None, cut = 10) ``` \ No newline at end of file diff --git a/report/qmarkdown/utilities/anomaly_score.py b/report/qmarkdown/utilities/anomaly_score.py new file mode 100644 index 0000000..e4f5811 --- /dev/null +++ b/report/qmarkdown/utilities/anomaly_score.py @@ -0,0 +1,45 @@ +from sklearn.ensemble import IsolationForest +import pickle + +def gen_anomaly_score(group, model_fpath): + """ + """ + # initiate and load isolation forest model + model = IsolationForestsModel() + model = model.read(model_fpath) + # split data + id_cols = ['userid', 'transaction_week'] + X_cols = ['E901_size', 'E901_sum', 'E902_size', 'E902_sum', 'n_comps', 'total_comp_size'] + train_group = group[X_cols] + score_group = group[id_cols+X_cols] + # train isolation forests and score data + model = model.fit(train_group) + score_group['score'] = model.decision_function(train_group) + # write model to disk + model.write(model_fpath) + return score_group + +def gen_cumulative_anomaly_score(group): + """ + """ + group_sort = group.sort_values('transaction_week') + group_sort['anomaly_score'] = group_sort['score'].cumsum() + return group_sort + +class IsolationForestsModel(): + def __init__(self, n_estimators=20, random_state=None, warm_start=False, n_jobs=None): + self.model = IsolationForest(n_estimators=n_estimators, random_state=random_state, warm_start=warm_start) + self.n_estimators = n_estimators + def fit(self, X): + if self.model.warm_start: + self.model.n_estimators += self.n_estimators + self.model = self.model.fit(X) + return self + def decision_function(self, X): + return self.model.decision_function(X) + def write(self, model_fpath): + with open(model_fpath,'wb') as f: + pickle.dump(self,f) + def read(self, model_fpath): + with open(model_fpath, 'rb') as f: + return pickle.load(f) diff --git a/report/qmarkdown/utilities/feature_engineer.py b/report/qmarkdown/utilities/feature_engineer.py new file mode 100644 index 0000000..48fe9cf --- /dev/null +++ b/report/qmarkdown/utilities/feature_engineer.py @@ -0,0 +1,25 @@ +import pandas as pd + +def feature_engineer(data, ids, groups, target, func): + """ + """ + # aggregate across the ids and group, applying the function to the target + data_agg = data.copy().groupby(by=ids+groups, as_index=False).agg({target:func}) + # pivot the target results across each group + data_pivot = pd.pivot_table(data=data_agg, index=ids, values=target, columns=groups) + # rename and format the columns + data_pivot.columns = 
data_pivot.columns.str.split(':').str[0] + f'_{func}' + data_pivot = data_pivot.reset_index() + data_pivot.columns.name = None + return data_pivot + +def merge_features(feat_objects): + """ + """ + feat_data = pd.DataFrame(columns=['userid', 'transaction_week']) + # join objects + for feat_object in feat_objects: + feat_data = pd.merge(left=feat_data, right=feat_object, how='outer', on=['userid', 'transaction_week']) + # fill for missing values + feat_data = feat_data.fillna(0) + return feat_data \ No newline at end of file diff --git a/report/qmarkdown/utilities/networks.py b/report/qmarkdown/utilities/networks.py new file mode 100644 index 0000000..fec4283 --- /dev/null +++ b/report/qmarkdown/utilities/networks.py @@ -0,0 +1,46 @@ +import pandas as pd +import networkx as nx + +def gen_base_network_data(entity_networks, user_cols=['userid_x','userid_y']): + """ + """ + # create a base data of users from all entity networks + base_data = pd.concat(objs=[df[user_cols] for df in entity_networks], ignore_index=True, axis=0).drop_duplicates().reset_index(drop=True) + # generate full base data by joining on all entity networks + for entity_network in entity_networks: + base_data = pd.merge(left=base_data, right=entity_network, on=user_cols, how='left') + return base_data + +def gen_entity_network_data(data, entity, userid = 'userid', trans_week = 'transaction_week'): + """ + """ + # extract out the unique user ids and device hashes + user_entity_data = data[[userid, entity, trans_week]].dropna().drop_duplicates() + # inner join users to users based on shared device hash + user_entity_network_data = pd.merge(left = user_entity_data, right = user_entity_data, on = [entity, trans_week], how = 'inner') + # drop rows where userid_x = userid_y + user_entity_network_data = user_entity_network_data.loc[user_entity_network_data[f'{userid}_x'] != user_entity_network_data[f'{userid}_y'], :] + # set col order + col_order = [f'{userid}_x', f'{userid}_y', entity, trans_week] + user_entity_network_data = user_entity_network_data[col_order] + return user_entity_data, user_entity_network_data + +def gen_comp_data(network_data, entity_data, edge_attr): + """ + """ + # apply graphs for each week + trans_week_graphs = network_data.groupby(by='transaction_week').apply(lambda group: nx.from_pandas_edgelist(df = group, source = 'userid_x', target = 'userid_y', edge_attr = [edge_attr])).rename('G').reset_index() + # extract connected components for each week + trans_week_comps = trans_week_graphs.apply(lambda series: pd.DataFrame([{'transaction_week':series['transaction_week'], 'comp_id':i, 'userid':cc} for i, cc in enumerate(nx.connected_components(series['G']))]).explode('userid').reset_index(drop = True), axis=1).to_list() + trans_week_comps = pd.concat(trans_week_comps, axis=0) + # calculate comp id sizes across each week + trans_week_comps_size = trans_week_comps.groupby(by = ['transaction_week', 'comp_id'], as_index = False).agg({'userid':'nunique'}).rename(columns={'userid':'comp_size'}) + # generate the component data + comp_data = pd.merge(left = entity_data, right = trans_week_comps, left_on = ['transaction_week', 'userid'], right_on = ['transaction_week', 'userid'], how = 'inner') + comp_data = pd.merge(left = comp_data, right = trans_week_comps_size, on = ['transaction_week', 'comp_id'], how = 'inner') + # order by comp size + comp_data = comp_data.sort_values(by = ['transaction_week', 'comp_id', 'userid', edge_attr]).reset_index(drop=True) + # normalise data with respect to edge attribute + comp_data = 
comp_data.rename(columns={edge_attr:'idhashes'})
+    comp_data['type'] = edge_attr
+    return comp_data
\ No newline at end of file
diff --git a/report/qmarkdown/utilities/plotting.py b/report/qmarkdown/utilities/plotting.py
new file mode 100644
index 0000000..8cc26ee
--- /dev/null
+++ b/report/qmarkdown/utilities/plotting.py
@@ -0,0 +1,50 @@
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+def distribution_plot(data, bins='auto', title=None):
+    """Plot the distribution of a data sample as a histogram."""
+    # set plot size and style
+    sns.set(rc={'figure.figsize':(7, 7), "lines.linewidth": 0.7})
+    sns.set_style("white")
+    dist_plot = sns.histplot(data=data, bins=bins)
+    if title is not None:
+        dist_plot.set_title(label=title)
+    plt.show()
+    plt.close()
+
+
+def lift_plot(col, target, data, order = 'default', cut = None):
+    """
+    Plot a bar chart of the target variable against a given predictor
+    """
+    # take a deep copy of the data for temporary storage
+    tmp_data = data.copy()
+    # calculate the average rate of the target
+    mean_y_yes = tmp_data[target].mean()
+    # if quantile cutting col
+    if cut is not None:
+        tmp_data[col] = pd.cut(x = tmp_data[col], bins = cut)
+    # determine plot order based on bin height
+    if order == 'default':
+        plot_order = tmp_data.groupby(col)[target].mean().sort_values(ascending=False).index.values
+    else:
+        plot_order = order
+    # set figure size
+    plt.figure(figsize=(8, 6))
+    # create bar plot
+    sns.barplot(data = tmp_data, x = col, y = target, estimator = np.mean, errorbar = None, color = 'royalblue', order = plot_order)
+    # format plot title, ticks and labels
+    plt.title(f'{col} vs {target}', size = 20)
+    plt.yticks(size = 15)
+    plt.xticks(rotation = 45, size = 15)
+    plt.xlabel(col, size = 18)
+    plt.ylabel(target, size = 18)
+    # red line indicates the average target rate
+    plt.axhline(y=mean_y_yes, color = 'red', linestyle = '--', linewidth = 3)
+    # show and close plot
+    plt.show()
+    plt.close()
+    return 0
\ No newline at end of file
diff --git a/report/qmarkdown/utilities/value_score.py b/report/qmarkdown/utilities/value_score.py
new file mode 100644
index 0000000..472d2cb
--- /dev/null
+++ b/report/qmarkdown/utilities/value_score.py
@@ -0,0 +1,19 @@
+def week_pct_score(group, score_cols = ['Successful_size','Successful_sum']):
+    """Percentile rank the score columns within a weekly group, returning '_pct' suffixed columns."""
+    # percentile rank the score columns
+    group_score = group[score_cols].rank(method='average', ascending=True, pct=True, axis=0)
+    group_score.columns = group_score.columns + '_pct'
+    return group_score
+
+def gen_weekly_user_scores(group, score_cols=['Successful_size_pct', 'Successful_sum_pct']):
+    """Average the weekly percentile scores into a single customer value score."""
+    # calculate the customer value score
+    customer_value_score = group[score_cols].mean(axis=1).rename('customer_value_score')
+    return customer_value_score
+
+def apply_cum_sum(group, score_col='customer_value_score'):
+    """Cumulatively sum a user's weekly value scores in transaction week order."""
+    # sort and apply cumsum
+    group_sort = group.sort_values(by='transaction_week')
+    customer_value_score_cum_sum = group_sort[score_col].cumsum().rename('customer_value_score_cum_sum')
+    return customer_value_score_cum_sum
\ No newline at end of file
diff --git a/report/qmarkdown/utilities/white_list.py b/report/qmarkdown/utilities/white_list.py
new file mode 100644
index 0000000..bd212cc
--- /dev/null
+++ b/report/qmarkdown/utilities/white_list.py
@@ -0,0 +1,7 @@
+def gen_white_list(group):
+    # create the high value user identifier
+    sub_cols = ['userid', 'transaction_week', 'Successful_size', 'Successful_sum', 'E901_size', 'E901_sum', 'customer_value_score_cum_sum']
+    high_value_data =
group.loc[:, sub_cols].copy() + high_value_data['customer_value_score_cum_sum_pct'] = high_value_data['customer_value_score_cum_sum'].rank(method='average', ascending=True, pct=True, axis=0) + high_value_data['high_value_user'] = (high_value_data['customer_value_score_cum_sum_pct'] >= 0.9).astype(int) + return high_value_data \ No newline at end of file diff --git a/report/white_list_users.qmd b/report/qmarkdown/white_list_users.qmd similarity index 62% rename from report/white_list_users.qmd rename to report/qmarkdown/white_list_users.qmd index a97076e..e83bf16 100644 --- a/report/white_list_users.qmd +++ b/report/qmarkdown/white_list_users.qmd @@ -1,3 +1,11 @@ +--- +title: "White List Users" +format: + html: + code-fold: true +jupyter: python3 +--- + ```{python} #| label: set-up import os @@ -8,8 +16,13 @@ import sys sys.path.append(os.getcwd()) sys.path.append(os.path.dirname(os.getcwd())) +sys.path.append(os.path.dirname(os.path.dirname(os.path.join(os.getcwd())))) import generator.cons as cons +from report.qmarkdown.utilities.white_list import gen_white_list + +data_dir = os.path.join('..', '..', 'data') +data_report_dir = os.path.join(data_dir, 'report') ``` # Data Load @@ -18,10 +31,10 @@ import generator.cons as cons #| label: data-load # load user feature data -user_feat_foath = os.path.join('..', 'data', 'report', 'user_feat_data.csv') +user_feat_foath = os.path.join(data_report_dir, 'user_feat_data.csv') user_feat_data = pd.read_csv(user_feat_foath) # load customer value score data -customer_value_score_foath = os.path.join('..', 'data', 'report', 'customer_value_score.csv') +customer_value_score_foath = os.path.join(data_report_dir, 'customer_value_score.csv') customer_value_score = pd.read_csv(customer_value_score_foath) # create the base data by joining user feature data and customer value score base_data = pd.merge(left=user_feat_data, right=customer_value_score, on=['userid', 'transaction_week'], how='inner') @@ -36,14 +49,6 @@ Identify the high value users as the customers with a customer value score in th ```{python} #| label: high-value-users -def gen_white_list(group): - # create the high value user identifier - sub_cols = ['userid', 'transaction_week', 'successful_size', 'successful_sum', 'E901_size', 'E901_sum', 'customer_value_score_cumsum'] - high_value_data = group.loc[:, sub_cols].copy() - high_value_data['customer_value_score_cumsum_pct'] = high_value_data['customer_value_score_cumsum'].rank(method='average', ascending=True, pct=True, axis=0) - high_value_data['high_value_user'] = (high_value_data['customer_value_score_cumsum_pct'] >= 0.9).astype(int) - return high_value_data - # identify white list users for each week high_value_data = base_data.groupby(by='transaction_week', group_keys=False, as_index=False).apply(lambda group: gen_white_list(group)) ``` @@ -53,10 +58,10 @@ high_value_data = base_data.groupby(by='transaction_week', group_keys=False, as_ ```{python} # sum across the groupby_col=['high_value_user'] -agg_dict={'successful_size':'sum', 'successful_sum':'sum', 'customer_value_score_cumsum':'mean', 'customer_value_score_cumsum_pct':'mean'} +agg_dict={'Successful_size':'sum', 'Successful_sum':'sum', 'customer_value_score_cum_sum':'mean', 'customer_value_score_cum_sum_pct':'mean'} high_value_data_agg = high_value_data.groupby(by=groupby_col).agg(agg_dict) # add total row -total_agg={'successful_size':'sum', 'successful_sum':'sum', 'customer_value_score_cumsum':'mean', 'customer_value_score_cumsum_pct':'mean'} +total_agg={'Successful_size':'sum', 
'Successful_sum':'sum', 'customer_value_score_cum_sum':'mean', 'customer_value_score_cum_sum_pct':'mean'} total_value_data_agg = high_value_data.agg(agg_dict).rename('total').to_frame().T high_value_data_agg = pd.concat(objs=[high_value_data_agg, total_value_data_agg], axis=0) high_value_data_agg.head() @@ -71,7 +76,7 @@ high_value_data['hasE901'] = (high_value_data['E901_size'] > 1).astype(int) # identiy any high value users with no suspected fraud transaction errors high_value_data['white_list_users'] = ((high_value_data['hasE901'] == 0) & (high_value_data['high_value_user'] == 1)).astype(int) # subset out white list users -sub_cols = ['userid', 'customer_value_score_cumsum_pct', 'transaction_week', 'high_value_user', 'hasE901', 'white_list_users'] +sub_cols = ['userid', 'customer_value_score_cum_sum_pct', 'transaction_week', 'high_value_user', 'hasE901', 'white_list_users'] white_list_users = high_value_data[sub_cols] ``` @@ -80,6 +85,6 @@ white_list_users = high_value_data[sub_cols] ```{python} #| label: write-data # save white list users to disk -white_list_users_fpath=os.path.join('..', 'data', 'report', 'white_list_user_data.csv') +white_list_users_fpath=os.path.join(data_report_dir, 'white_list_user_data.csv') white_list_users.to_csv(white_list_users_fpath, index=False) ``` diff --git a/requirements.txt b/requirements.txt index ab8a1d7..0a184ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,7 @@ beartype==0.19.0 unidecode==1.3.8 boto3==1.36.12 fastapi[standard]==0.128.0 -pillow==12.1.1 \ No newline at end of file +nbformat==5.10.4 +nbclient==0.10.4 +tabulate==0.9.0 +pillow==12.1.1 diff --git a/uv.lock b/uv.lock index 1197834..466b68d 100644 --- a/uv.lock +++ b/uv.lock @@ -51,6 +51,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl", hash = "sha256:15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a", size = 27047, upload-time = "2025-11-15T16:43:16.109Z" }, ] +[[package]] +name = "attrs" +version = "25.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6b/5c/685e6633917e101e5dcb62b9dd76946cbb57c26e133bae9e0cd36033c0a9/attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11", size = 934251, upload-time = "2025-10-06T13:54:44.725Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, +] + [[package]] name = "beartype" version = "0.19.0" @@ -531,6 +540,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/85/11/0aa8455af26f0ae89e42be67f3a874255ee5d7f0f026fc86e8d56f76b428/fastar-0.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:e59673307b6a08210987059a2bdea2614fe26e3335d0e5d1a3d95f49a05b1418", size = 460467, upload-time = "2025-11-26T02:36:07.978Z" }, ] +[[package]] +name = "fastjsonschema" +version = "2.21.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/b5/23b216d9d985a956623b6bd12d4086b60f0059b27799f23016af04a74ea1/fastjsonschema-2.21.2.tar.gz", hash = "sha256:b1eb43748041c880796cd077f1a07c3d94e93ae84bba5ed36800a33554ae05de", size = 374130, upload-time = "2025-08-14T18:49:36.666Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/cb/a8/20d0723294217e47de6d9e2e40fd4a9d2f7c4b6ef974babd482a59743694/fastjsonschema-2.21.2-py3-none-any.whl", hash = "sha256:1c797122d0a86c5cace2e54bf4e819c36223b552017172f32c5c024a6b77e463", size = 24024, upload-time = "2025-08-14T18:49:34.776Z" }, +] + [[package]] name = "fastparquet" version = "2024.11.0" @@ -786,6 +804,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, ] +[[package]] +name = "jsonschema" +version = "4.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "jsonschema-specifications" }, + { name = "referencing" }, + { name = "rpds-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583, upload-time = "2026-01-07T13:41:07.246Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" }, +] + +[[package]] +name = "jsonschema-specifications" +version = "2025.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "referencing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/74/a633ee74eb36c44aa6d1095e7cc5569bebf04342ee146178e2d36600708b/jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d", size = 32855, upload-time = "2025-09-08T01:34:59.186Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, +] + [[package]] name = "jupyter-client" version = "8.8.0" @@ -1057,6 +1102,36 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, ] +[[package]] +name = "nbclient" +version = "0.10.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jupyter-client" }, + { name = "jupyter-core" }, + { name = "nbformat" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/56/91/1c1d5a4b9a9ebba2b4e32b8c852c2975c872aec1fe42ab5e516b2cecd193/nbclient-0.10.4.tar.gz", hash = "sha256:1e54091b16e6da39e297b0ece3e10f6f29f4ac4e8ee515d29f8a7099bd6553c9", size = 62554, upload-time = "2025-12-23T07:45:46.369Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/a0/5b0c2f11142ed1dddec842457d3f65eaf71a0080894eb6f018755b319c3a/nbclient-0.10.4-py3-none-any.whl", hash = "sha256:9162df5a7373d70d606527300a95a975a47c137776cd942e52d9c7e29ff83440", size = 25465, upload-time = "2025-12-23T07:45:44.51Z" }, +] + +[[package]] +name 
= "nbformat" +version = "5.10.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fastjsonschema" }, + { name = "jsonschema" }, + { name = "jupyter-core" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/fd/91545e604bc3dad7dca9ed03284086039b294c6b3d75c0d2fa45f9e9caf3/nbformat-5.10.4.tar.gz", hash = "sha256:322168b14f937a5d11362988ecac2a4952d3d8e3a2cbeb2319584631226d5b3a", size = 142749, upload-time = "2024-04-04T11:20:37.371Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/82/0340caa499416c78e5d8f5f05947ae4bc3cba53c9f038ab6e9ed964e22f1/nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b", size = 78454, upload-time = "2024-04-04T11:20:34.895Z" }, +] + [[package]] name = "nest-asyncio" version = "1.6.0" @@ -1628,6 +1703,8 @@ dependencies = [ { name = "fastapi", extra = ["standard"] }, { name = "fastparquet" }, { name = "ipykernel" }, + { name = "nbclient" }, + { name = "nbformat" }, { name = "networkx" }, { name = "numpy" }, { name = "pandas" }, @@ -1637,6 +1714,7 @@ dependencies = [ { name = "scipy" }, { name = "seaborn" }, { name = "shap" }, + { name = "tabulate" }, { name = "unidecode" }, ] @@ -1647,6 +1725,8 @@ requires-dist = [ { name = "fastapi", extras = ["standard"], specifier = "==0.128.0" }, { name = "fastparquet", specifier = "==2024.11.0" }, { name = "ipykernel", specifier = "==6.29.5" }, + { name = "nbclient", specifier = "==0.10.4" }, + { name = "nbformat", specifier = "==5.10.4" }, { name = "networkx", specifier = "==3.4.2" }, { name = "numpy", specifier = "==2.0.2" }, { name = "pandas", specifier = "==2.2.3" }, @@ -1656,9 +1736,24 @@ requires-dist = [ { name = "scipy", specifier = "==1.15.1" }, { name = "seaborn", specifier = "==0.13.2" }, { name = "shap", specifier = "==0.46.0" }, + { name = "tabulate", specifier = "==0.9.0" }, { name = "unidecode", specifier = "==1.3.8" }, ] +[[package]] +name = "referencing" +version = "0.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "rpds-py" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl", hash = "sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231", size = 26766, upload-time = "2025-10-13T15:30:47.625Z" }, +] + [[package]] name = "rich" version = "14.3.1" @@ -1754,6 +1849,87 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/79/62/b88e5879512c55b8ee979c666ee6902adc4ed05007226de266410ae27965/rignore-0.7.6-cp314-cp314t-win_arm64.whl", hash = "sha256:b83adabeb3e8cf662cabe1931b83e165b88c526fa6af6b3aa90429686e474896", size = 656035, upload-time = "2025-11-05T21:41:31.13Z" }, ] +[[package]] +name = "rpds-py" +version = "0.30.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 
69469, upload-time = "2025-11-30T20:24:38.837Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/e7/98a2f4ac921d82f33e03f3835f5bf3a4a40aa1bfdc57975e74a97b2b4bdd/rpds_py-0.30.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a161f20d9a43006833cd7068375a94d035714d73a172b681d8881820600abfad", size = 375086, upload-time = "2025-11-30T20:22:17.93Z" }, + { url = "https://files.pythonhosted.org/packages/4d/a1/bca7fd3d452b272e13335db8d6b0b3ecde0f90ad6f16f3328c6fb150c889/rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6abc8880d9d036ecaafe709079969f56e876fcf107f7a8e9920ba6d5a3878d05", size = 359053, upload-time = "2025-11-30T20:22:19.297Z" }, + { url = "https://files.pythonhosted.org/packages/65/1c/ae157e83a6357eceff62ba7e52113e3ec4834a84cfe07fa4b0757a7d105f/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28", size = 390763, upload-time = "2025-11-30T20:22:21.661Z" }, + { url = "https://files.pythonhosted.org/packages/d4/36/eb2eb8515e2ad24c0bd43c3ee9cd74c33f7ca6430755ccdb240fd3144c44/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1010ed9524c73b94d15919ca4d41d8780980e1765babf85f9a2f90d247153dd", size = 408951, upload-time = "2025-11-30T20:22:23.408Z" }, + { url = "https://files.pythonhosted.org/packages/d6/65/ad8dc1784a331fabbd740ef6f71ce2198c7ed0890dab595adb9ea2d775a1/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8d1736cfb49381ba528cd5baa46f82fdc65c06e843dab24dd70b63d09121b3f", size = 514622, upload-time = "2025-11-30T20:22:25.16Z" }, + { url = "https://files.pythonhosted.org/packages/63/8e/0cfa7ae158e15e143fe03993b5bcd743a59f541f5952e1546b1ac1b5fd45/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d948b135c4693daff7bc2dcfc4ec57237a29bd37e60c2fabf5aff2bbacf3e2f1", size = 414492, upload-time = "2025-11-30T20:22:26.505Z" }, + { url = "https://files.pythonhosted.org/packages/60/1b/6f8f29f3f995c7ffdde46a626ddccd7c63aefc0efae881dc13b6e5d5bb16/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f236970bccb2233267d89173d3ad2703cd36a0e2a6e92d0560d333871a3d23", size = 394080, upload-time = "2025-11-30T20:22:27.934Z" }, + { url = "https://files.pythonhosted.org/packages/6d/d5/a266341051a7a3ca2f4b750a3aa4abc986378431fc2da508c5034d081b70/rpds_py-0.30.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:2e6ecb5a5bcacf59c3f912155044479af1d0b6681280048b338b28e364aca1f6", size = 408680, upload-time = "2025-11-30T20:22:29.341Z" }, + { url = "https://files.pythonhosted.org/packages/10/3b/71b725851df9ab7a7a4e33cf36d241933da66040d195a84781f49c50490c/rpds_py-0.30.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a8fa71a2e078c527c3e9dc9fc5a98c9db40bcc8a92b4e8858e36d329f8684b51", size = 423589, upload-time = "2025-11-30T20:22:31.469Z" }, + { url = "https://files.pythonhosted.org/packages/00/2b/e59e58c544dc9bd8bd8384ecdb8ea91f6727f0e37a7131baeff8d6f51661/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73c67f2db7bc334e518d097c6d1e6fed021bbc9b7d678d6cc433478365d1d5f5", size = 573289, upload-time = "2025-11-30T20:22:32.997Z" }, + { url = "https://files.pythonhosted.org/packages/da/3e/a18e6f5b460893172a7d6a680e86d3b6bc87a54c1f0b03446a3c8c7b588f/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_i686.whl", hash = 
"sha256:5ba103fb455be00f3b1c2076c9d4264bfcb037c976167a6047ed82f23153f02e", size = 599737, upload-time = "2025-11-30T20:22:34.419Z" }, + { url = "https://files.pythonhosted.org/packages/5c/e2/714694e4b87b85a18e2c243614974413c60aa107fd815b8cbc42b873d1d7/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7cee9c752c0364588353e627da8a7e808a66873672bcb5f52890c33fd965b394", size = 563120, upload-time = "2025-11-30T20:22:35.903Z" }, + { url = "https://files.pythonhosted.org/packages/6f/ab/d5d5e3bcedb0a77f4f613706b750e50a5a3ba1c15ccd3665ecc636c968fd/rpds_py-0.30.0-cp312-cp312-win32.whl", hash = "sha256:1ab5b83dbcf55acc8b08fc62b796ef672c457b17dbd7820a11d6c52c06839bdf", size = 223782, upload-time = "2025-11-30T20:22:37.271Z" }, + { url = "https://files.pythonhosted.org/packages/39/3b/f786af9957306fdc38a74cef405b7b93180f481fb48453a114bb6465744a/rpds_py-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:a090322ca841abd453d43456ac34db46e8b05fd9b3b4ac0c78bcde8b089f959b", size = 240463, upload-time = "2025-11-30T20:22:39.021Z" }, + { url = "https://files.pythonhosted.org/packages/f3/d2/b91dc748126c1559042cfe41990deb92c4ee3e2b415f6b5234969ffaf0cc/rpds_py-0.30.0-cp312-cp312-win_arm64.whl", hash = "sha256:669b1805bd639dd2989b281be2cfd951c6121b65e729d9b843e9639ef1fd555e", size = 230868, upload-time = "2025-11-30T20:22:40.493Z" }, + { url = "https://files.pythonhosted.org/packages/ed/dc/d61221eb88ff410de3c49143407f6f3147acf2538c86f2ab7ce65ae7d5f9/rpds_py-0.30.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:f83424d738204d9770830d35290ff3273fbb02b41f919870479fab14b9d303b2", size = 374887, upload-time = "2025-11-30T20:22:41.812Z" }, + { url = "https://files.pythonhosted.org/packages/fd/32/55fb50ae104061dbc564ef15cc43c013dc4a9f4527a1f4d99baddf56fe5f/rpds_py-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e7536cd91353c5273434b4e003cbda89034d67e7710eab8761fd918ec6c69cf8", size = 358904, upload-time = "2025-11-30T20:22:43.479Z" }, + { url = "https://files.pythonhosted.org/packages/58/70/faed8186300e3b9bdd138d0273109784eea2396c68458ed580f885dfe7ad/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2771c6c15973347f50fece41fc447c054b7ac2ae0502388ce3b6738cd366e3d4", size = 389945, upload-time = "2025-11-30T20:22:44.819Z" }, + { url = "https://files.pythonhosted.org/packages/bd/a8/073cac3ed2c6387df38f71296d002ab43496a96b92c823e76f46b8af0543/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0a59119fc6e3f460315fe9d08149f8102aa322299deaa5cab5b40092345c2136", size = 407783, upload-time = "2025-11-30T20:22:46.103Z" }, + { url = "https://files.pythonhosted.org/packages/77/57/5999eb8c58671f1c11eba084115e77a8899d6e694d2a18f69f0ba471ec8b/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:76fec018282b4ead0364022e3c54b60bf368b9d926877957a8624b58419169b7", size = 515021, upload-time = "2025-11-30T20:22:47.458Z" }, + { url = "https://files.pythonhosted.org/packages/e0/af/5ab4833eadc36c0a8ed2bc5c0de0493c04f6c06de223170bd0798ff98ced/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:692bef75a5525db97318e8cd061542b5a79812d711ea03dbc1f6f8dbb0c5f0d2", size = 414589, upload-time = "2025-11-30T20:22:48.872Z" }, + { url = "https://files.pythonhosted.org/packages/b7/de/f7192e12b21b9e9a68a6d0f249b4af3fdcdff8418be0767a627564afa1f1/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:9027da1ce107104c50c81383cae773ef5c24d296dd11c99e2629dbd7967a20c6", size = 394025, upload-time = "2025-11-30T20:22:50.196Z" }, + { url = "https://files.pythonhosted.org/packages/91/c4/fc70cd0249496493500e7cc2de87504f5aa6509de1e88623431fec76d4b6/rpds_py-0.30.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:9cf69cdda1f5968a30a359aba2f7f9aa648a9ce4b580d6826437f2b291cfc86e", size = 408895, upload-time = "2025-11-30T20:22:51.87Z" }, + { url = "https://files.pythonhosted.org/packages/58/95/d9275b05ab96556fefff73a385813eb66032e4c99f411d0795372d9abcea/rpds_py-0.30.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a4796a717bf12b9da9d3ad002519a86063dcac8988b030e405704ef7d74d2d9d", size = 422799, upload-time = "2025-11-30T20:22:53.341Z" }, + { url = "https://files.pythonhosted.org/packages/06/c1/3088fc04b6624eb12a57eb814f0d4997a44b0d208d6cace713033ff1a6ba/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5d4c2aa7c50ad4728a094ebd5eb46c452e9cb7edbfdb18f9e1221f597a73e1e7", size = 572731, upload-time = "2025-11-30T20:22:54.778Z" }, + { url = "https://files.pythonhosted.org/packages/d8/42/c612a833183b39774e8ac8fecae81263a68b9583ee343db33ab571a7ce55/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ba81a9203d07805435eb06f536d95a266c21e5b2dfbf6517748ca40c98d19e31", size = 599027, upload-time = "2025-11-30T20:22:56.212Z" }, + { url = "https://files.pythonhosted.org/packages/5f/60/525a50f45b01d70005403ae0e25f43c0384369ad24ffe46e8d9068b50086/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:945dccface01af02675628334f7cf49c2af4c1c904748efc5cf7bbdf0b579f95", size = 563020, upload-time = "2025-11-30T20:22:58.2Z" }, + { url = "https://files.pythonhosted.org/packages/0b/5d/47c4655e9bcd5ca907148535c10e7d489044243cc9941c16ed7cd53be91d/rpds_py-0.30.0-cp313-cp313-win32.whl", hash = "sha256:b40fb160a2db369a194cb27943582b38f79fc4887291417685f3ad693c5a1d5d", size = 223139, upload-time = "2025-11-30T20:23:00.209Z" }, + { url = "https://files.pythonhosted.org/packages/f2/e1/485132437d20aa4d3e1d8b3fb5a5e65aa8139f1e097080c2a8443201742c/rpds_py-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:806f36b1b605e2d6a72716f321f20036b9489d29c51c91f4dd29a3e3afb73b15", size = 240224, upload-time = "2025-11-30T20:23:02.008Z" }, + { url = "https://files.pythonhosted.org/packages/24/95/ffd128ed1146a153d928617b0ef673960130be0009c77d8fbf0abe306713/rpds_py-0.30.0-cp313-cp313-win_arm64.whl", hash = "sha256:d96c2086587c7c30d44f31f42eae4eac89b60dabbac18c7669be3700f13c3ce1", size = 230645, upload-time = "2025-11-30T20:23:03.43Z" }, + { url = "https://files.pythonhosted.org/packages/ff/1b/b10de890a0def2a319a2626334a7f0ae388215eb60914dbac8a3bae54435/rpds_py-0.30.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:eb0b93f2e5c2189ee831ee43f156ed34e2a89a78a66b98cadad955972548be5a", size = 364443, upload-time = "2025-11-30T20:23:04.878Z" }, + { url = "https://files.pythonhosted.org/packages/0d/bf/27e39f5971dc4f305a4fb9c672ca06f290f7c4e261c568f3dea16a410d47/rpds_py-0.30.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:922e10f31f303c7c920da8981051ff6d8c1a56207dbdf330d9047f6d30b70e5e", size = 353375, upload-time = "2025-11-30T20:23:06.342Z" }, + { url = "https://files.pythonhosted.org/packages/40/58/442ada3bba6e8e6615fc00483135c14a7538d2ffac30e2d933ccf6852232/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdc62c8286ba9bf7f47befdcea13ea0e26bf294bda99758fd90535cbaf408000", size = 383850, upload-time = 
"2025-11-30T20:23:07.825Z" }, + { url = "https://files.pythonhosted.org/packages/14/14/f59b0127409a33c6ef6f5c1ebd5ad8e32d7861c9c7adfa9a624fc3889f6c/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47f9a91efc418b54fb8190a6b4aa7813a23fb79c51f4bb84e418f5476c38b8db", size = 392812, upload-time = "2025-11-30T20:23:09.228Z" }, + { url = "https://files.pythonhosted.org/packages/b3/66/e0be3e162ac299b3a22527e8913767d869e6cc75c46bd844aa43fb81ab62/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f3587eb9b17f3789ad50824084fa6f81921bbf9a795826570bda82cb3ed91f2", size = 517841, upload-time = "2025-11-30T20:23:11.186Z" }, + { url = "https://files.pythonhosted.org/packages/3d/55/fa3b9cf31d0c963ecf1ba777f7cf4b2a2c976795ac430d24a1f43d25a6ba/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39c02563fc592411c2c61d26b6c5fe1e51eaa44a75aa2c8735ca88b0d9599daa", size = 408149, upload-time = "2025-11-30T20:23:12.864Z" }, + { url = "https://files.pythonhosted.org/packages/60/ca/780cf3b1a32b18c0f05c441958d3758f02544f1d613abf9488cd78876378/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a1234d8febafdfd33a42d97da7a43f5dcb120c1060e352a3fbc0c6d36e2083", size = 383843, upload-time = "2025-11-30T20:23:14.638Z" }, + { url = "https://files.pythonhosted.org/packages/82/86/d5f2e04f2aa6247c613da0c1dd87fcd08fa17107e858193566048a1e2f0a/rpds_py-0.30.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:eb2c4071ab598733724c08221091e8d80e89064cd472819285a9ab0f24bcedb9", size = 396507, upload-time = "2025-11-30T20:23:16.105Z" }, + { url = "https://files.pythonhosted.org/packages/4b/9a/453255d2f769fe44e07ea9785c8347edaf867f7026872e76c1ad9f7bed92/rpds_py-0.30.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6bdfdb946967d816e6adf9a3d8201bfad269c67efe6cefd7093ef959683c8de0", size = 414949, upload-time = "2025-11-30T20:23:17.539Z" }, + { url = "https://files.pythonhosted.org/packages/a3/31/622a86cdc0c45d6df0e9ccb6becdba5074735e7033c20e401a6d9d0e2ca0/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c77afbd5f5250bf27bf516c7c4a016813eb2d3e116139aed0096940c5982da94", size = 565790, upload-time = "2025-11-30T20:23:19.029Z" }, + { url = "https://files.pythonhosted.org/packages/1c/5d/15bbf0fb4a3f58a3b1c67855ec1efcc4ceaef4e86644665fff03e1b66d8d/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:61046904275472a76c8c90c9ccee9013d70a6d0f73eecefd38c1ae7c39045a08", size = 590217, upload-time = "2025-11-30T20:23:20.885Z" }, + { url = "https://files.pythonhosted.org/packages/6d/61/21b8c41f68e60c8cc3b2e25644f0e3681926020f11d06ab0b78e3c6bbff1/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c5f36a861bc4b7da6516dbdf302c55313afa09b81931e8280361a4f6c9a2d27", size = 555806, upload-time = "2025-11-30T20:23:22.488Z" }, + { url = "https://files.pythonhosted.org/packages/f9/39/7e067bb06c31de48de3eb200f9fc7c58982a4d3db44b07e73963e10d3be9/rpds_py-0.30.0-cp313-cp313t-win32.whl", hash = "sha256:3d4a69de7a3e50ffc214ae16d79d8fbb0922972da0356dcf4d0fdca2878559c6", size = 211341, upload-time = "2025-11-30T20:23:24.449Z" }, + { url = "https://files.pythonhosted.org/packages/0a/4d/222ef0b46443cf4cf46764d9c630f3fe4abaa7245be9417e56e9f52b8f65/rpds_py-0.30.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f14fc5df50a716f7ece6a80b6c78bb35ea2ca47c499e422aa4463455dd96d56d", size = 225768, upload-time = 
"2025-11-30T20:23:25.908Z" }, + { url = "https://files.pythonhosted.org/packages/86/81/dad16382ebbd3d0e0328776d8fd7ca94220e4fa0798d1dc5e7da48cb3201/rpds_py-0.30.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:68f19c879420aa08f61203801423f6cd5ac5f0ac4ac82a2368a9fcd6a9a075e0", size = 362099, upload-time = "2025-11-30T20:23:27.316Z" }, + { url = "https://files.pythonhosted.org/packages/2b/60/19f7884db5d5603edf3c6bce35408f45ad3e97e10007df0e17dd57af18f8/rpds_py-0.30.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ec7c4490c672c1a0389d319b3a9cfcd098dcdc4783991553c332a15acf7249be", size = 353192, upload-time = "2025-11-30T20:23:29.151Z" }, + { url = "https://files.pythonhosted.org/packages/bf/c4/76eb0e1e72d1a9c4703c69607cec123c29028bff28ce41588792417098ac/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f251c812357a3fed308d684a5079ddfb9d933860fc6de89f2b7ab00da481e65f", size = 384080, upload-time = "2025-11-30T20:23:30.785Z" }, + { url = "https://files.pythonhosted.org/packages/72/87/87ea665e92f3298d1b26d78814721dc39ed8d2c74b86e83348d6b48a6f31/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac98b175585ecf4c0348fd7b29c3864bda53b805c773cbf7bfdaffc8070c976f", size = 394841, upload-time = "2025-11-30T20:23:32.209Z" }, + { url = "https://files.pythonhosted.org/packages/77/ad/7783a89ca0587c15dcbf139b4a8364a872a25f861bdb88ed99f9b0dec985/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3e62880792319dbeb7eb866547f2e35973289e7d5696c6e295476448f5b63c87", size = 516670, upload-time = "2025-11-30T20:23:33.742Z" }, + { url = "https://files.pythonhosted.org/packages/5b/3c/2882bdac942bd2172f3da574eab16f309ae10a3925644e969536553cb4ee/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e7fc54e0900ab35d041b0601431b0a0eb495f0851a0639b6ef90f7741b39a18", size = 408005, upload-time = "2025-11-30T20:23:35.253Z" }, + { url = "https://files.pythonhosted.org/packages/ce/81/9a91c0111ce1758c92516a3e44776920b579d9a7c09b2b06b642d4de3f0f/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47e77dc9822d3ad616c3d5759ea5631a75e5809d5a28707744ef79d7a1bcfcad", size = 382112, upload-time = "2025-11-30T20:23:36.842Z" }, + { url = "https://files.pythonhosted.org/packages/cf/8e/1da49d4a107027e5fbc64daeab96a0706361a2918da10cb41769244b805d/rpds_py-0.30.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:b4dc1a6ff022ff85ecafef7979a2c6eb423430e05f1165d6688234e62ba99a07", size = 399049, upload-time = "2025-11-30T20:23:38.343Z" }, + { url = "https://files.pythonhosted.org/packages/df/5a/7ee239b1aa48a127570ec03becbb29c9d5a9eb092febbd1699d567cae859/rpds_py-0.30.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4559c972db3a360808309e06a74628b95eaccbf961c335c8fe0d590cf587456f", size = 415661, upload-time = "2025-11-30T20:23:40.263Z" }, + { url = "https://files.pythonhosted.org/packages/70/ea/caa143cf6b772f823bc7929a45da1fa83569ee49b11d18d0ada7f5ee6fd6/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0ed177ed9bded28f8deb6ab40c183cd1192aa0de40c12f38be4d59cd33cb5c65", size = 565606, upload-time = "2025-11-30T20:23:42.186Z" }, + { url = "https://files.pythonhosted.org/packages/64/91/ac20ba2d69303f961ad8cf55bf7dbdb4763f627291ba3d0d7d67333cced9/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:ad1fa8db769b76ea911cb4e10f049d80bf518c104f15b3edb2371cc65375c46f", size = 591126, 
upload-time = "2025-11-30T20:23:44.086Z" }, + { url = "https://files.pythonhosted.org/packages/21/20/7ff5f3c8b00c8a95f75985128c26ba44503fb35b8e0259d812766ea966c7/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:46e83c697b1f1c72b50e5ee5adb4353eef7406fb3f2043d64c33f20ad1c2fc53", size = 553371, upload-time = "2025-11-30T20:23:46.004Z" }, + { url = "https://files.pythonhosted.org/packages/72/c7/81dadd7b27c8ee391c132a6b192111ca58d866577ce2d9b0ca157552cce0/rpds_py-0.30.0-cp314-cp314-win32.whl", hash = "sha256:ee454b2a007d57363c2dfd5b6ca4a5d7e2c518938f8ed3b706e37e5d470801ed", size = 215298, upload-time = "2025-11-30T20:23:47.696Z" }, + { url = "https://files.pythonhosted.org/packages/3e/d2/1aaac33287e8cfb07aab2e6b8ac1deca62f6f65411344f1433c55e6f3eb8/rpds_py-0.30.0-cp314-cp314-win_amd64.whl", hash = "sha256:95f0802447ac2d10bcc69f6dc28fe95fdf17940367b21d34e34c737870758950", size = 228604, upload-time = "2025-11-30T20:23:49.501Z" }, + { url = "https://files.pythonhosted.org/packages/e8/95/ab005315818cc519ad074cb7784dae60d939163108bd2b394e60dc7b5461/rpds_py-0.30.0-cp314-cp314-win_arm64.whl", hash = "sha256:613aa4771c99f03346e54c3f038e4cc574ac09a3ddfb0e8878487335e96dead6", size = 222391, upload-time = "2025-11-30T20:23:50.96Z" }, + { url = "https://files.pythonhosted.org/packages/9e/68/154fe0194d83b973cdedcdcc88947a2752411165930182ae41d983dcefa6/rpds_py-0.30.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:7e6ecfcb62edfd632e56983964e6884851786443739dbfe3582947e87274f7cb", size = 364868, upload-time = "2025-11-30T20:23:52.494Z" }, + { url = "https://files.pythonhosted.org/packages/83/69/8bbc8b07ec854d92a8b75668c24d2abcb1719ebf890f5604c61c9369a16f/rpds_py-0.30.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a1d0bc22a7cdc173fedebb73ef81e07faef93692b8c1ad3733b67e31e1b6e1b8", size = 353747, upload-time = "2025-11-30T20:23:54.036Z" }, + { url = "https://files.pythonhosted.org/packages/ab/00/ba2e50183dbd9abcce9497fa5149c62b4ff3e22d338a30d690f9af970561/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d08f00679177226c4cb8c5265012eea897c8ca3b93f429e546600c971bcbae7", size = 383795, upload-time = "2025-11-30T20:23:55.556Z" }, + { url = "https://files.pythonhosted.org/packages/05/6f/86f0272b84926bcb0e4c972262f54223e8ecc556b3224d281e6598fc9268/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5965af57d5848192c13534f90f9dd16464f3c37aaf166cc1da1cae1fd5a34898", size = 393330, upload-time = "2025-11-30T20:23:57.033Z" }, + { url = "https://files.pythonhosted.org/packages/cb/e9/0e02bb2e6dc63d212641da45df2b0bf29699d01715913e0d0f017ee29438/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a4e86e34e9ab6b667c27f3211ca48f73dba7cd3d90f8d5b11be56e5dbc3fb4e", size = 518194, upload-time = "2025-11-30T20:23:58.637Z" }, + { url = "https://files.pythonhosted.org/packages/ee/ca/be7bca14cf21513bdf9c0606aba17d1f389ea2b6987035eb4f62bd923f25/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d3e6b26f2c785d65cc25ef1e5267ccbe1b069c5c21b8cc724efee290554419", size = 408340, upload-time = "2025-11-30T20:24:00.2Z" }, + { url = "https://files.pythonhosted.org/packages/c2/c7/736e00ebf39ed81d75544c0da6ef7b0998f8201b369acf842f9a90dc8fce/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:626a7433c34566535b6e56a1b39a7b17ba961e97ce3b80ec62e6f1312c025551", size = 383765, upload-time = 
"2025-11-30T20:24:01.759Z" }, + { url = "https://files.pythonhosted.org/packages/4a/3f/da50dfde9956aaf365c4adc9533b100008ed31aea635f2b8d7b627e25b49/rpds_py-0.30.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:acd7eb3f4471577b9b5a41baf02a978e8bdeb08b4b355273994f8b87032000a8", size = 396834, upload-time = "2025-11-30T20:24:03.687Z" }, + { url = "https://files.pythonhosted.org/packages/4e/00/34bcc2565b6020eab2623349efbdec810676ad571995911f1abdae62a3a0/rpds_py-0.30.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fe5fa731a1fa8a0a56b0977413f8cacac1768dad38d16b3a296712709476fbd5", size = 415470, upload-time = "2025-11-30T20:24:05.232Z" }, + { url = "https://files.pythonhosted.org/packages/8c/28/882e72b5b3e6f718d5453bd4d0d9cf8df36fddeb4ddbbab17869d5868616/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:74a3243a411126362712ee1524dfc90c650a503502f135d54d1b352bd01f2404", size = 565630, upload-time = "2025-11-30T20:24:06.878Z" }, + { url = "https://files.pythonhosted.org/packages/3b/97/04a65539c17692de5b85c6e293520fd01317fd878ea1995f0367d4532fb1/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:3e8eeb0544f2eb0d2581774be4c3410356eba189529a6b3e36bbbf9696175856", size = 591148, upload-time = "2025-11-30T20:24:08.445Z" }, + { url = "https://files.pythonhosted.org/packages/85/70/92482ccffb96f5441aab93e26c4d66489eb599efdcf96fad90c14bbfb976/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:dbd936cde57abfee19ab3213cf9c26be06d60750e60a8e4dd85d1ab12c8b1f40", size = 556030, upload-time = "2025-11-30T20:24:10.956Z" }, + { url = "https://files.pythonhosted.org/packages/20/53/7c7e784abfa500a2b6b583b147ee4bb5a2b3747a9166bab52fec4b5b5e7d/rpds_py-0.30.0-cp314-cp314t-win32.whl", hash = "sha256:dc824125c72246d924f7f796b4f63c1e9dc810c7d9e2355864b3c3a73d59ade0", size = 211570, upload-time = "2025-11-30T20:24:12.735Z" }, + { url = "https://files.pythonhosted.org/packages/d0/02/fa464cdfbe6b26e0600b62c528b72d8608f5cc49f96b8d6e38c95d60c676/rpds_py-0.30.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27f4b0e92de5bfbc6f86e43959e6edd1425c33b5e69aab0984a72047f2bcf1e3", size = 226532, upload-time = "2025-11-30T20:24:14.634Z" }, +] + [[package]] name = "s3transfer" version = "0.11.3" @@ -1934,6 +2110,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/52/1064f510b141bd54025f9b55105e26d1fa970b9be67ad766380a3c9b74b0/starlette-0.50.0-py3-none-any.whl", hash = "sha256:9e5391843ec9b6e472eed1365a78c8098cfceb7a74bfd4d6b1c0c0095efb3bca", size = 74033, upload-time = "2025-11-01T15:25:25.461Z" }, ] +[[package]] +name = "tabulate" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, +] + [[package]] name = "threadpoolctl" version = "3.6.0"