Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
e71c776
Moved .qmd report scripts to qmarkdown subdirectory. Added .cmd file …
oislen Feb 1, 2026
7462744
Added relative path to requirements.txt file
oislen Feb 1, 2026
092c43c
Added nb dependencies for generating qmarkdown reports
oislen Feb 1, 2026
ec55d3e
Added assertions to qa report
oislen Feb 1, 2026
dd037b9
Added tabulate to requirements
oislen Feb 9, 2026
8d242cc
Revised distribution plot and added introductory notes
oislen Feb 9, 2026
6098eb1
Added assertions for remaining qa checks
oislen Feb 9, 2026
50380a5
Standardised naming conventions
oislen Feb 9, 2026
d61b505
Moved all utility functions from qmarkdown reports to separate sub-di…
oislen Feb 11, 2026
d52c70a
Revised feature engineering and customer value score qmarkdown reports
oislen Feb 12, 2026
11e1d76
Merge pull request #64 from oislen/dev
oislen Feb 15, 2026
9848fc4
Merged updated bedrock branch and resolved conflicts
oislen Feb 16, 2026
75455e9
Pointing towards new archived version
oislen Feb 16, 2026
75e75f9
Added trivy vulnerability patches
oislen Feb 16, 2026
0857d00
Started functionalising QA assertion logic
oislen Feb 17, 2026
b8bd1bf
Added qa classes for ips, transactions and card
oislen Feb 21, 2026
578a7cc
Calling qa class objects
oislen Feb 21, 2026
1037fa2
Added logic to show plots and or logs
oislen Feb 21, 2026
a15803d
Calling qa checks at end of main process
oislen Feb 21, 2026
58a3b19
Added logging and showing plots and logs
oislen Feb 21, 2026
d74ca44
Reviewed qmarkdown reports
oislen Feb 21, 2026
639c421
Fixed bugs in report scripts
oislen Feb 21, 2026
7009c1d
Fixed bugs in report scripts
oislen Feb 21, 2026
c2bfc19
Fixed bugs in report scripts
oislen Feb 21, 2026
804915f
Fixed directories and apply qa in main after data is written to disk
oislen Feb 21, 2026
4aac836
Updated max range checks for uids qa
oislen Feb 21, 2026
8eca505
Updated min range checks for uids qa
oislen Feb 21, 2026
6c6ac4c
Added qa check of registration date vs transaction date
oislen Feb 21, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ ENV PYTHON_VERSION=${PYTHON_VERSION}
# install required software and programmes for development environment
RUN apt-get update
RUN apt-get install -y apt-utils vim curl wget unzip tree htop adduser
RUN apt-get install -y imagemagick=8:7.1.1.43+dfsg1-1+deb13u5 libssl-dev=3.5.4-1~deb13u2
RUN apt-get install -y imagemagick=8:7.1.1.43+dfsg1-1+deb13u5 libssl-dev=3.5.4-1~deb13u2 libpq-dev=17.8-0+deb13u1 linux-libc-dev=6.12.69-1

# set up home environment
RUN adduser ${user}
Expand Down
2 changes: 1 addition & 1 deletion config/uv/RandomTelecomPayments.cmd
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
:: call powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
call uv add -r requirements.txt --link-mode=copy
call uv add -r ..\..\requirements.txt --link-mode=copy
2 changes: 1 addition & 1 deletion generator/cons.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# set data files
fpath_randomtelecomtransdata = os.path.join(subdir_data,'RandomTelecomPayments.csv')
fpath_randomtelecomusersdata = os.path.join(subdir_data,'RandomTelecomUsers.parquet')
fpath_arch_randomtelecomdata = os.path.join(subdir_data, 'arch', 'RandomTelecomPayments.csv')
fpath_arch_randomtelecomdata = os.path.join(subdir_data, 'arch', 'RandomTelecomPaymentsV0.3.csv')
fpath_temp_llama_first_names = os.path.join(subdir_data, 'temp', 'llama_first_names_{country}.csv')
fpath_temp_llama_last_names = os.path.join(subdir_data, 'temp', 'llama_last_names_{country}.csv')
fpath_temp_llama_email_domains = os.path.join(subdir_data, 'temp', 'llama_email_domains_{country}.csv')
Expand Down
14 changes: 10 additions & 4 deletions generator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from utilities.input_error_handling import input_error_handling
from utilities.multiprocess import multiprocess
from app.gen_random_telecom_data import gen_random_telecom_data
import qa

def main(input_params_dict: dict):
"""
Expand Down Expand Up @@ -56,10 +57,6 @@ def main(input_params_dict: dict):
# order results by userid and transaction date ascending
user_data = user_data.sort_values(by = 'uid').reset_index(drop = True)
trans_data = trans_data.sort_values(by = 'transaction_date').reset_index(drop = True)
# end timer
t1 = time()
total_runtime_seconds = round(t1 - t0, 2)
logging.info(f'Total Runtime: {total_runtime_seconds} seconds')
# print out head and shape of data
logging.info(f'RandomTeleComUsersData.shape: {user_data.shape}')
logging.info(f'RandomTeleComTransData.shape: {trans_data.shape}')
Expand All @@ -73,6 +70,15 @@ def main(input_params_dict: dict):
logging.info(f'Writing output trans level random telecoms data to: {cons.fpath_randomtelecomtransdata}')
user_data.to_parquet(cons.fpath_randomtelecomusersdata, engine='fastparquet')
trans_data.to_csv(cons.fpath_randomtelecomtransdata, index = False)
# initialise QA objects
qa.Uids(data=trans_data).run_all()
qa.Transactions(data=trans_data).run_all()
qa.Cards(data=trans_data).run_all()
qa.Ips(data=trans_data).run_all()
# end timer
t1 = time()
total_runtime_seconds = round(t1 - t0, 2)
logging.info(f'Total Runtime: {total_runtime_seconds} seconds')
# return dataframes as dictionary
return {"user_data": user_data, "trans_data": trans_data}

Expand Down
51 changes: 51 additions & 0 deletions generator/qa/Cards.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import logging
import seaborn as sns

class Cards():
    """QA checks for card-level attributes of the random telecom transaction data."""

    def __init__(self, data, show_logs=False, show_plots=False):
        """
        Initialise the Cards QA object.

        Parameters
        ----------
        data : pandas.DataFrame
            Transaction-level data; expected to contain the columns
            card_hash, card_type, card_country_code, transaction_hash,
            transaction_amount and transaction_payment_method.
        show_logs : bool, default False
            If True, log the summary aggregation tables.
        show_plots : bool, default False
            If True, render distribution plots via seaborn.
        """
        logging.info("Initialising Cards QA")
        self.data = data
        self.show_logs = show_logs
        self.show_plots = show_plots

    def unique_card_types(self):
        """
        Check that each card hash maps to at most one card type.

        Raises
        ------
        AssertionError
            If any card has more than one distinct card type, or if no card
            has an entirely-null card type (the data is expected to contain
            at least one such card, hence min nunique == 0), or if the
            per-card nunique values contain nulls.
        """
        # distinct card types per card hash; dropna=False keeps null card hashes as a group
        nunique_card_types_per_card = self.data.groupby(by=['card_hash'], as_index=False, dropna=False).agg({'card_type':'nunique'})
        # transaction counts and amount totals per card type / payment method
        card_type_payment_totals = self.data.groupby(by=['card_type', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_hash':'size', 'transaction_amount':'sum'})
        # test assertions
        assert nunique_card_types_per_card['card_type'].max() == 1
        assert nunique_card_types_per_card['card_type'].min() == 0
        assert not nunique_card_types_per_card['card_type'].isnull().any()
        if self.show_plots:
            # plot distribution of distinct card types per card
            sns.histplot(data=nunique_card_types_per_card,x='card_type', bins = 20)
        if self.show_logs:
            # show logs; .to_markdown() for consistency with the Ips QA logging style
            logging.info(card_type_payment_totals.to_markdown())

    def unique_country_codes(self):
        """
        Check that each card hash maps to at most one card country code.

        Raises
        ------
        AssertionError
            If any card has more than one distinct country code, or if no
            card has an entirely-null country code (min nunique must be 0),
            or if the per-card nunique values contain nulls.
        """
        # distinct country codes per card hash
        nunique_country_codes_per_card = self.data.groupby(by=['card_hash'], as_index=False, dropna=False).agg({'card_country_code':'nunique'})
        # totals split by a 0/1 indicator for whether a country code is present
        card_country_code_payment_totals = self.data.assign(card_country_code=self.data['card_country_code'].notnull().astype(int)).groupby(by=['card_country_code', 'transaction_payment_method'], as_index=False, dropna=False).agg({'transaction_hash':'size', 'transaction_amount':'sum'})
        # test assertions
        assert nunique_country_codes_per_card['card_country_code'].max() == 1
        assert nunique_country_codes_per_card['card_country_code'].min() == 0
        assert not nunique_country_codes_per_card['card_country_code'].isnull().any()
        if self.show_plots:
            # plot distribution of distinct country codes per card
            sns.histplot(data=nunique_country_codes_per_card,x='card_country_code', bins = 20)
        if self.show_logs:
            # show logs; .to_markdown() for consistency with the Ips QA logging style
            logging.info(card_country_code_payment_totals.to_markdown())

    def run_all(self):
        """Run every Cards QA check; raises AssertionError on the first failure."""
        self.unique_card_types()
        self.unique_country_codes()
        logging.info("All Cards QA checks passed.")
34 changes: 34 additions & 0 deletions generator/qa/Ips.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import logging
import seaborn as sns

class Ips():
    """QA checks for ip-level attributes of the random telecom transaction data."""

    def __init__(self, data, show_logs=False, show_plots=False):
        """Store the transaction data and the flags controlling plot / log output."""
        logging.info("Initialising Ips QA")
        self.data = data
        self.show_logs = show_logs
        self.show_plots = show_plots

    def unique_country_codes(self):
        """Check that every ip hash resolves to exactly one country code."""
        # distinct country codes per ip hash; dropna=False keeps null-key groups
        grouped_by_ip = self.data.groupby(by=['ip_hash'], as_index=False, dropna=False)
        codes_per_ip = grouped_by_ip.agg({'ip_country_code': 'nunique'})
        # row counts per country code across all transactions
        country_code_counts = self.data.groupby(by=['ip_country_code'], as_index=False, dropna=False).size()
        # test assertions: each ip must map to exactly one country code
        nunique_series = codes_per_ip['ip_country_code']
        assert nunique_series.max() == 1
        assert nunique_series.min() == 1
        assert not nunique_series.isnull().any()
        if self.show_plots:
            # distribution of distinct country codes per ip
            sns.histplot(data=codes_per_ip, x='ip_country_code', bins=20)
        if self.show_logs:
            # tabulated counts per country code
            logging.info(country_code_counts.to_markdown())

    def run_all(self):
        """Execute every Ips QA check in sequence."""
        self.unique_country_codes()
        logging.info("All Ips QA checks passed.")
Loading