Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
39e96d2
Revised bedrock data logic
oislen Feb 12, 2026
0e4b7b2
Fixed bedrock parsing of response. Updated logging. Fixed email_domai…
oislen Feb 13, 2026
34f4d68
Updated exe file for running bedrock data
oislen Feb 13, 2026
5352604
Updated logging and readded file appending logic
oislen Feb 13, 2026
16087f5
Added new reference file for email domain names
oislen Feb 13, 2026
3310771
Added more logging and a unique system prompt for email domain names
oislen Feb 13, 2026
371a17a
Removed population from email domain name reference file
oislen Feb 13, 2026
264995c
Upped to 10 email domain names. Added sort by data point for each cou…
oislen Feb 13, 2026
317209f
Updated llama first name reference file.
oislen Feb 13, 2026
329a54c
Updated llama email domain names
oislen Feb 13, 2026
09d6341
Removed population column
oislen Feb 13, 2026
1cc37aa
Refreshed llama reference files
oislen Feb 14, 2026
9ad19db
Added rank logic to llama reference files. Increased n data points lo…
oislen Feb 14, 2026
6398287
Add logic to invert ranks and convert to probability
oislen Feb 15, 2026
306dae9
Added inverted ranks and weighted probabilities by country to referen…
oislen Feb 15, 2026
e5f1f41
Adding 1 to probability calculation to ensure > 0 values
oislen Feb 15, 2026
806b71f
Refreshed llama reference files
oislen Feb 15, 2026
cce3d22
Defaulting to llama email domains. Applying generate user bedrock dat…
oislen Feb 15, 2026
15fcf81
Defaulting to llama email domains
oislen Feb 15, 2026
d734fe5
Updated email domain transaction rejection rates using new llama emai…
oislen Feb 15, 2026
7b7dcbb
Refreshed test data files given latest changes
oislen Feb 15, 2026
16a87a6
Updated unittests given integration of llama email domains reference …
oislen Feb 15, 2026
d068081
Fixed standardise text lambda function to execute for non nan values …
oislen Feb 15, 2026
0aa0fa7
Readded comment for individual countries
oislen Feb 15, 2026
57a39ad
Removed special characters from names
oislen Feb 15, 2026
a78990b
Removed special characters from names
oislen Feb 15, 2026
8e42ff3
Updated test case
oislen Feb 15, 2026
dc922c3
Added power logic to probability calculation
oislen Feb 15, 2026
7a7f7e7
Updated email domain power calculation
oislen Feb 15, 2026
281c381
Updated test data
oislen Feb 15, 2026
451029b
Updated test cases
oislen Feb 15, 2026
68ce7af
Added explicit pillow install to fix trivy vulnerability
oislen Feb 15, 2026
4b3a63c
Merge pull request #63 from oislen/56-update-bedrock-model
oislen Feb 15, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
706 changes: 706 additions & 0 deletions data/ref/llama_email_domains.csv

Large diffs are not rendered by default.

10,787 changes: 5,231 additions & 5,556 deletions data/ref/llama_first_names.csv

Large diffs are not rendered by default.

9,947 changes: 5,160 additions & 4,787 deletions data/ref/llama_last_names.csv

Large diffs are not rendered by default.

Binary file modified data/unittest/transaction_data.parquet
Binary file not shown.
Binary file modified data/unittest/user_data.parquet
Binary file not shown.
3 changes: 1 addition & 2 deletions generator/app/gen_random_telecom_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ def gen_random_telecom_data(
fpath_first_names=cons.fpath_llama_first_names,
fpath_last_names=cons.fpath_llama_last_names,
fpath_countries_europe=cons.fpath_countries_europe,
fpath_email_domain=cons.fpath_email_domain,
fpath_bedrock_email_domain=cons.fpath_llama_email_domains
fpath_email_domain=cons.fpath_llama_email_domains,
)

# generate random entity counts for each user
Expand Down
154 changes: 111 additions & 43 deletions generator/batch/gen_bedrock_data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# python generator/batch/gen_bedrock_data.py
# uv run python batch/gen_bedrock_data.py --data_point first_names --run_bedrock
# uv run python batch/gen_bedrock_data.py --data_point last_names --run_bedrock
# uv run python batch/gen_bedrock_data.py --data_point email_domains --run_bedrock

import os
import json
import boto3
from botocore.config import Config
Expand All @@ -9,6 +12,7 @@
import unidecode
import pandas as pd
import numpy as np
import argparse

sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator")

Expand Down Expand Up @@ -49,23 +53,53 @@
"""

system_email_prompt = """
# Task

You are an email domain name generator for different countries in Europe.
Your task is to generate an arbitrary N number of distinct and varied email domains, for a given European country.

# Requirements

- Generate typical and popular email domains.
- Do not repeat any email domains more than once.
- Each individual email domain must be unique.
- You should return the email domains using a valid JSON object tagged as <answer></answer>.
- The valid JSON object should be of the following structures; `["email domain 1","email domain 2",...,"email domain N"]`.

# Examples

- Generate 2 popular email domain names for people from the country "Germany" -> <answer>["gmail.com","web.de"]</answer>
- Generate 4 popular email domain names for people from the country "United Kingdom" -> <answer>["gmail.com","outlook.com","yahoo.co.uk","btinternet.com"]</answer>
- Generate 3 popular email domain names for people from the country "France" -> <answer>["orange.fr","laposte.net","free.fr"]</answer>
- Generate 5 popular email domain names for people from the country "Spain" -> <answer>["gmail.com","hotmail.es","yahoo.es","outlook.es","telefonica.net"]</answer>
- Generate 6 popular email domain names for people from the country "Sweden" -> <answer>["gmail.com","hotmail.com","outlook.com","telia.com","spray.se","comhem.se"]</answer>
"""

system_name = [{"text":system_name_prompt,}]
system_email = [{"text":system_email_prompt,}]

first_name_prompt = 'Generate {n_data_points} first names for people from the country "{country}"'
surname_prompt = 'Generate {n_data_points} last names for people from the country "{country}"'
email_domain_prompt = 'Generate {n_data_points} popular email domains names for people from the country "{country}"'

bedrock_config = {
"inferenceConfig":{
"maxTokens":8192,
"temperature":0.5,
"topP":0.5,
},
"system":[
{
"text":system_name_prompt
last_name_prompt = 'Generate {n_data_points} last names for people from the country "{country}"'
email_domain_prompt = 'Generate {n_data_points} popular email domain names for people from the country "{country}"'

data_point_prompt_dict = {
"first_names":[first_name_prompt, system_name],
"last_names":[last_name_prompt, system_name],
"email_domains":[email_domain_prompt, system_email]
}

boto3_config = Config(
connect_timeout=60,
read_timeout=300,
retries={
"max_attempts":2,
"mode": "adaptive"
}
]
)

inferenceConfig = {
"maxTokens":8192,
"temperature":0.1,
}

def invoke_bedrock(
Expand All @@ -75,8 +109,6 @@ def invoke_bedrock(
n_data_points:int,
country:str,
countrieseurope:pd.DataFrame,
prompt:str,
system_prompt:str,
country_fpath:str,
) -> tuple[pd.DataFrame, pd.DataFrame]:
"""
Expand Down Expand Up @@ -121,64 +153,69 @@ def invoke_bedrock(
"""
logging.info("Calling Bedrock ...")
# call bedrock model
formatted_prompt = prompt.format(n_data_points=n_data_points, country=country)
formatted_prompt = data_point_prompt_dict[data_point][0].format(n_data_points=n_data_points, country=country)
system = data_point_prompt_dict[data_point][1]
messages = [{"role":"user", "content":[{"text":formatted_prompt}]}]
logging.info(messages)
#model_response = model.prompt(model_id=model_id, user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048)
model_response = model.converse(modelId=model_id, messages=messages, system=bedrock_config['system'], inference_config=bedrock_config['inferenceConfig'])
model_response = model.converse(modelId=model_id, messages=messages, system=system, inference_config=inferenceConfig)
# split out answer
text = model_response.split("<answer>")[1].split("</answer>")[0]
text = model_response['output']['message']['content'][0]['text'].split("<answer>")[1].split("</answer>")[0]
# parse json
try:
gen_data_list = json.loads(text)
except json.JSONDecodeError as e:
raise Exception(f"Error parsing JSON: {e}")
logging.info("Processing results ...")
# generate pandas dataframe
gen_dataframe = pd.Series(gen_data_list, name=data_point).drop_duplicates().to_frame()
gen_dataframe = pd.Series(gen_data_list, name=data_point).drop_duplicates().to_frame().reset_index().rename(columns={'index':'rank'})
gen_dataframe['country'] = country
gen_country_dataframe = pd.merge(
left=gen_dataframe,
right=countrieseurope.rename(columns={'name':'country'}),
left_on='country',
right_on='name',
how='inner'
)
gen_country_dataframe = pd.merge(left=gen_dataframe, right=countrieseurope.rename(columns={'name':'country'}), on='country', how='inner')
# standardise names formatting
standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.strip())) if pd.isna(x) else x
standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.strip().split())) if not pd.isna(x) else x
gen_country_dataframe[data_point] = gen_country_dataframe[data_point].apply(lambda x: standardise_text_lambda(x))
# check against previous iterations
tmp_gen_country_dataframe = pd.DataFrame()
if os.path.exists(country_fpath):
tmp_gen_country_dataframe = pd.read_csv(country_fpath, encoding="utf-8")
# concatenate results
gen_country_dataframe = pd.concat(objs=[gen_country_dataframe, tmp_gen_country_dataframe], axis=0, ignore_index=True)
# deduplicate data
groupby_cols = [data_point,"country","ISO numeric"]
agg_dict = {"rank":"mean"}
gen_country_dataframe = gen_country_dataframe.dropna().groupby(groupby_cols, as_index=False).agg(agg_dict).sort_values(by=groupby_cols)
logging.info(f"gen_country_dataframe.shape: {gen_country_dataframe.shape}")
# save generated data
gen_country_dataframe.to_csv(country_fpath, index=False, encoding="latin1")
gen_country_dataframe.to_csv(country_fpath, index=False, encoding="utf-8")
logging.info(f"Wrote {country_fpath} ...")
return gen_country_dataframe

def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False):
"""
Docstring for main
"""
logging.info(f"data_point:{data_point} ...")
# load countries, first_names and surnames files
countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric'])
countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric', 'population'])
n_countries = countrieseurope.shape[0]
# set lists to collect generated data with
gen_country_dataframe_list, error_countries = [], []
# set countries list
#countries_list = countrieseurope['name'].to_list()
countries_list = ['Cyprus']
countries_list = countrieseurope['name'].to_list()
#countries_list = ['Ireland']
# iterate over countries list
for country in countries_list:
logging.info(f"{country} ...")
country_fpath=fpath_dict['country_fpath'].format(country)
logging.info(f"country:{country} ...")
country_fpath=fpath_dict['country_fpath'].format(country=country)
try:
if run_bedrock:
# call bedrock model and generate user names data
country_filter = (countrieseurope["name"] == country)
country_population = countrieseurope.loc[country_filter, "population"].iloc[0]
# set n data points for ai generator depending on type
if data_point in ("first_names", "last_names"):
n_data_points = int(np.log(country_population)**1.5)
n_data_points = int(np.log(country_population)**1.75)
elif data_point == "email_domains":
n_data_points = 5
n_data_points = 15
else:
raise ValueError(f"Invalid parameter data_point value {data_point}")
# invoke bedrock and generate data points
Expand All @@ -196,6 +233,7 @@ def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False):
time.sleep(20)
else:
tmp_gen_country_data = pd.read_csv(country_fpath, encoding="latin1")
logging.info(f"tmp_gen_country_data.shape:{tmp_gen_country_data.shape}")
# append to user country data
gen_country_dataframe_list.append(tmp_gen_country_data)
except Exception as e:
Expand All @@ -204,25 +242,50 @@ def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False):
# log if any countries failed to generate data
if len(error_countries) > 0:
logging.info(f"Failed to generated data for countries: {error_countries}")
logging.info(f"Concatenating Country files ...")
# concatenate user country data together and deduplicate across first_names and countries
output_gen_country_dataframe = pd.concat(gen_country_dataframe_list, axis=0, ignore_index=True)
# invert the index ranks and then convert to probability weightings within countries
invert_rank_lambda = lambda group: group['rank'].max() - group['rank']
conv_proba_lambda = lambda group, power: (group['inverse_rank'].pow(power) + 1) / (group['inverse_rank'].pow(power).sum() + 1)
output_gen_country_dataframe['inverse_rank'] = output_gen_country_dataframe.groupby(by=["country"], as_index=False, group_keys=False).apply(invert_rank_lambda, include_groups=False)
if data_point in ("first_names", "last_names"):
output_gen_country_dataframe['probability'] = output_gen_country_dataframe.groupby(by=["country"], as_index=False, group_keys=False).apply(conv_proba_lambda, power=1, include_groups=False)
elif data_point == "email_domains":
output_gen_country_dataframe['probability'] = output_gen_country_dataframe.groupby(by=["country"], as_index=False, group_keys=False).apply(conv_proba_lambda, power=1.5, include_groups=False)
else:
raise ValueError(f"Invalid parameter data_point value {data_point}")
# sort and deduplicate output data
sort_dedup_cols = ["country",data_point]
output_gen_country_dataframe = output_gen_country_dataframe.drop_duplicates(subset=sort_dedup_cols).sort_values(by=sort_dedup_cols)
# write data to disk
if output_gen_country_dataframe['country'].nunique() == n_countries:
logging.info(f"Writing reference file: {fpath_dict['fpath']}")
output_gen_country_dataframe.to_csv(fpath_dict["fpath"], index=False, encoding="utf-8")
logging.info(f"output_gen_country_dataframe.shape: {output_gen_country_dataframe.shape}")
output_gen_country_dataframe.to_csv(fpath_dict["fpath"], index=False, encoding="latin1")
else:
logging.info("WARNING Insufficient first name data generated.")
logging.info(f"WARNING Insufficient {data_point} data generated.")

lgr = logging.getLogger()
lgr.setLevel(logging.INFO)

if __name__ == "__main__":
# define argument parser object
parser = argparse.ArgumentParser(description="Execute Random TeleCom Data Programme.")
# add input arguments
parser.add_argument("--data_point", action="store", dest="data_point", type=str, choices=list(cons.llama_data_point_fpaths.keys()), help="String, the data point to generate Bedrock data for.",)
parser.add_argument("--run_bedrock", action=argparse.BooleanOptionalAction, dest="run_bedrock", type=bool, default=False, help="Boolean, whether to generate data by calling Bedrock",)
# extract input arguments
args = parser.parse_args()
data_point = args.data_point
run_bedrock = args.run_bedrock
logging.info(f"data_point: {data_point}")
logging.info(f"run_bedrock: {run_bedrock}")
# set aws region
aws_region = "us-east-1"
model_id="us.meta.llama3-1-70b-instruct-v1:0"
model_id="us.anthropic.claude-sonnet-4-5-20250929-v1:0"
logging.info(f"aws_region: {aws_region}")
logging.info(f"model_id: {model_id}")
# load aws config
with open(cons.fpath_aws_session_token, "r") as j:
aws_config = json.loads(j.read())
Expand All @@ -236,11 +299,16 @@ def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False):
bedrock_runtime = session.client(
service_name="bedrock-runtime",
region_name=aws_region,
config=Config(retries={"max_attempts":1, "mode": "adaptive"})
config=boto3_config
)
# create bedrock instance
bedrock = Bedrock(bedrock_runtime=bedrock_runtime)
# execute main programme
for data_point, fpath_dict in cons.llama_data_point_fpaths.items():
main(bedrock=bedrock, model_id=model_id, data_point=data_point, fpath_dict=fpath_dict, run_bedrock=True)
main(
bedrock=bedrock,
model_id=model_id,
data_point=data_point,
fpath_dict=cons.llama_data_point_fpaths[data_point],
run_bedrock=run_bedrock
)

2 changes: 1 addition & 1 deletion generator/cons.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
llama_data_point_fpaths = {
"first_names":{"fpath":fpath_llama_first_names, "country_fpath":fpath_temp_llama_first_names},
"last_names":{"fpath":fpath_llama_last_names, "country_fpath":fpath_temp_llama_last_names},
"email_domain":{"fpath":fpath_llama_email_domains, "country_fpath":fpath_temp_llama_email_domains}
"email_domains":{"fpath":fpath_llama_email_domains, "country_fpath":fpath_temp_llama_email_domains}
}

# set url links to files available online
Expand Down
3 changes: 3 additions & 0 deletions generator/exeBedrockData.cmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
call uv run python batch\gen_bedrock_data.py --data_point first_names --run_bedrock
call uv run python batch\gen_bedrock_data.py --data_point last_names --run_bedrock
call uv run python batch\gen_bedrock_data.py --data_point email_domains --run_bedrock
1 change: 0 additions & 1 deletion generator/exeGenNames.cmd

This file was deleted.

Loading