diff --git a/doc/data_dictionary.csv b/doc/data_dictionary.csv index a0f3df4..ee5f362 100644 --- a/doc/data_dictionary.csv +++ b/doc/data_dictionary.csv @@ -1,7 +1,7 @@ Column Name,Data Type,Nullable,Example,Description userid,Integer,FALSE,2021010138044459,"A 16 digit unique user identifier number; constructed from registration date, registration iso numeric country code, and uid." -firstname,String,FALSE,kellen,The registered user firstname. -lastname,String,FALSE,mcgregor,The registered user lastname. +first_name,String,FALSE,kellen,The registered user first name. +last_name,String,FALSE,mcgregor,The registered user last name. registration_date,Date,FALSE,2021-01-13,The date the user registered on. registration_country_code,String,FALSE,FR,The user registered country code of residence. uid,Integer,FALSE,2127333684657263,A unique incremental id for the user. @@ -9,7 +9,7 @@ email_domain,String,FALSE,gmail.com,The domain of the registered user email addr device_hash,String,FALSE,5b386290c91e553e,The hashed device id the user used in the transaction. device_type,String,FALSE,Samsung Galaxy A32,The device type of the hashed device id the user used in the transaction. card_hash,String,TRUE,e00643bdd845feba,The hashed card id the user used in the transaction. -card_type,String,TRUE,visa,The card type of the hashed card id the user used in the transaction. +card_type,String,TRUE,Visa,"The card type of the hashed card id the user used in the transaction; one of ['Visa', 'Mastercard']." card_country_code,String,TRUE,ES,The card country code of the hashed card id the user used in the transaction. ip_hash,String,FALSE,1775d41b3788a941,The hashed ip address the user used in the transaction. ip_country_code,String,FALSE,PL,The ip address country code of the hashed ip address the user used in the transaction. 
@@ -17,7 +17,7 @@ application_hash,String,FALSE,a726fea0a21cfb47,The hashed application id the tra transaction_hash,String,FALSE,ced72c91695a15c6,The hashed id of the transaction. transaction_date,Date,FALSE,2021-01-20,The date of the transaction. transaction_amount,Float,FALSE,1.44,The transaction amount. -transaction_payment_method,String,TRUE,card,"The payment method used to complete the transaction; one of ['card', 'wallet', 'points']" -card_payment_channel,String,TRUE,paypal,"The payment channel the user used to complete the transaction; one of ['paypal', 'adyen', 'appstore', 'worldpay', 'docomo']." -transaction_status,String,FALSE,successful,"The status of the transaction; one of ['successful', 'pending', 'rejected']." +transaction_payment_method,String,TRUE,Card,"The payment method used to complete the transaction; one of ['Card', 'Wallet', 'Points']." +card_payment_channel,String,TRUE,PayPal,"The payment channel the user used to complete the transaction; one of ['PayPal', 'Adyen', 'AppStore', 'Worldpay', 'Docomo']." +transaction_status,String,FALSE,Successful,"The status of the transaction; one of ['Successful', 'Pending', 'Rejected']." transaction_error_code,String,TRUE,E902:AuthenicationFailure,"The associated error code if the transaction was rejected; one of ['E900:ConnectionTimeout', 'E901:SuspectedFraud', 'E902:AuthenicationFailure', 'E903:UserCancelled', 'E904:InsufficientFunds']." 
diff --git a/doc/entity_relationship_diagram.drawio b/doc/entity_relationship_diagram.drawio index 0632cea..eb67953 100644 --- a/doc/entity_relationship_diagram.drawio +++ b/doc/entity_relationship_diagram.drawio @@ -1,6 +1,6 @@ - + - + @@ -379,7 +379,7 @@ - + @@ -392,7 +392,7 @@ - + diff --git a/doc/entity_relationship_diagram.jpg b/doc/entity_relationship_diagram.jpg index ec65735..961e20c 100644 Binary files a/doc/entity_relationship_diagram.jpg and b/doc/entity_relationship_diagram.jpg differ diff --git a/generator/cons.py b/generator/cons.py index 0f25067..2cfd4cf 100644 --- a/generator/cons.py +++ b/generator/cons.py @@ -56,6 +56,7 @@ default_registration_end_date = (date_today - datetime.timedelta(days=366)).strftime(date_date_strftime) default_transaction_start_date = (date_today - datetime.timedelta(days=365)).strftime(date_date_strftime) default_transaction_end_date = date_today.strftime(date_date_strftime) +default_is_release = False # define default input parameters dictionary default_input_params_dict = { "n_users": default_n_users, diff --git a/generator/exeKaggle.cmd b/generator/exeKaggle.cmd new file mode 100644 index 0000000..05597eb --- /dev/null +++ b/generator/exeKaggle.cmd @@ -0,0 +1 @@ +call uv run main.py --n_users 13000 --use_random_seed 1 --n_itr 2 --is_release \ No newline at end of file diff --git a/generator/exeMain.cmd b/generator/exeMain.cmd index d338dde..edb3a72 100644 --- a/generator/exeMain.cmd +++ b/generator/exeMain.cmd @@ -1,2 +1 @@ -call uv run main.py --n_users 100 --use_random_seed 1 --n_itr 1 -:: call uv run main.py --n_users 13000 --use_random_seed 1 --n_itr 2 \ No newline at end of file +call uv run main.py --n_users 100 --use_random_seed 1 --n_itr 1 \ No newline at end of file diff --git a/generator/main.py b/generator/main.py index 36dde7b..c41cf2a 100644 --- a/generator/main.py +++ b/generator/main.py @@ -57,6 +57,10 @@ def main(input_params_dict: dict): # order results by userid and transaction date ascending user_data 
= user_data.sort_values(by = 'uid').reset_index(drop = True) trans_data = trans_data.sort_values(by = 'transaction_date').reset_index(drop = True) + # if data is for release drop itr_hash column + if input_params_dict['is_release']: + user_data = user_data.drop(columns=['itr_hash']) + trans_data = trans_data.drop(columns=['itr_hash']) # print out head and shape of data logging.info(f'RandomTeleComUsersData.shape: {user_data.shape}') logging.info(f'RandomTeleComTransData.shape: {trans_data.shape}') diff --git a/generator/qa/Uids.py b/generator/qa/Uids.py index 32b653d..9fcbb80 100644 --- a/generator/qa/Uids.py +++ b/generator/qa/Uids.py @@ -93,7 +93,7 @@ def unique_cards(self): nunique_cards_per_uid = self.data.groupby(by='uid', dropna=False, as_index=False).agg({'card_hash':'nunique'}).sort_values(by=['card_hash']) # test assertions assert nunique_cards_per_uid['card_hash'].max() <= 20 - assert nunique_cards_per_uid['card_hash'].min() == 0 + assert nunique_cards_per_uid['card_hash'].min() >= 0 assert self.data['card_hash'].isnull().any() if self.show_plots: # plot distribution diff --git a/generator/utilities/commandline_interface.py b/generator/utilities/commandline_interface.py index 867f866..ddbbae5 100644 --- a/generator/utilities/commandline_interface.py +++ b/generator/utilities/commandline_interface.py @@ -31,6 +31,8 @@ def commandline_interface() -> Dict[str, object]: The start date for transactions. transaction_end_date : str The end date for transactions. 
+ is_release : bool + Whether the data being generated is for release. Returns ------- @@ -48,6 +50,7 @@ def commandline_interface() -> Dict[str, object]: parser.add_argument("--registration_end_date", action="store", dest="registration_end_date", type=str, default=cons.default_registration_end_date, help="String, the end date for registrations",) parser.add_argument("--transaction_start_date", action="store", dest="transaction_start_date", type=str, default=cons.default_transaction_start_date, help="String, the start date for transactions",) parser.add_argument("--transaction_end_date", action="store", dest="transaction_end_date", type=str, default=cons.default_transaction_end_date, help="String, the end date for transactions",) + parser.add_argument("--is_release", action="store_true", dest="is_release", default=cons.default_is_release, help="Bool, whether the data being generated is for release",) # create an output dictionary to hold the results input_params_dict = cons.default_input_params_dict.copy() # extract input arguments @@ -61,4 +64,5 @@ def commandline_interface() -> Dict[str, object]: input_params_dict["registration_end_date"] = args.registration_end_date input_params_dict["transaction_start_date"] = args.transaction_start_date input_params_dict["transaction_end_date"] = args.transaction_end_date + input_params_dict["is_release"] = args.is_release return input_params_dict