Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
39e96d2
Revised bedrock data logic
oislen Feb 12, 2026
0e4b7b2
Fixed bedrock parsing of response. Updated logging. Fixed email_domai…
oislen Feb 13, 2026
34f4d68
Updated exe file for running bedrock data
oislen Feb 13, 2026
5352604
Updated logging and readded file appending logic
oislen Feb 13, 2026
16087f5
Added new reference file for email domain names
oislen Feb 13, 2026
3310771
Added more logging and a unique system prompt for email domain names
oislen Feb 13, 2026
371a17a
Removed population from email domain name reference file
oislen Feb 13, 2026
264995c
Upped to 10 email domain names. Added sort by data point for each cou…
oislen Feb 13, 2026
317209f
Updated llama first name reference file.
oislen Feb 13, 2026
329a54c
Updated llama email domain names
oislen Feb 13, 2026
09d6341
Removed population column
oislen Feb 13, 2026
1cc37aa
Refreshed llama reference files
oislen Feb 14, 2026
9ad19db
Added rank logic to llama reference files. Increased n data points lo…
oislen Feb 14, 2026
6398287
Add logic to invert ranks and convert to probability
oislen Feb 15, 2026
306dae9
Added inverted ranks and weighted probabilities by country to referen…
oislen Feb 15, 2026
e5f1f41
Adding 1 to probability calculation to ensure > 0 values
oislen Feb 15, 2026
806b71f
Refreshed llama reference files
oislen Feb 15, 2026
cce3d22
Defaulting to llama email domains. Applying generate user bedrock dat…
oislen Feb 15, 2026
15fcf81
Defaulting to llama email domains
oislen Feb 15, 2026
d734fe5
Updated email domain transaction rejection rates using new llama emai…
oislen Feb 15, 2026
7b7dcbb
Refreshed test data files given latest changes
oislen Feb 15, 2026
16a87a6
Updated unittests given integration of llama email domains reference …
oislen Feb 15, 2026
d068081
Fixed standardise text lambda function to execute for non nan values …
oislen Feb 15, 2026
0aa0fa7
Readded comment for individual countries
oislen Feb 15, 2026
57a39ad
Removed special characters from names
oislen Feb 15, 2026
a78990b
Removed special characters from names
oislen Feb 15, 2026
8e42ff3
Updated test case
oislen Feb 15, 2026
dc922c3
Added power logic to probability calculation
oislen Feb 15, 2026
7a7f7e7
Updated email domain power calculation
oislen Feb 15, 2026
281c381
Updated test data
oislen Feb 15, 2026
451029b
Updated test cases
oislen Feb 15, 2026
68ce7af
Added explicit pillow install to fix trivy vulnerability
oislen Feb 15, 2026
4b3a63c
Merge pull request #63 from oislen/56-update-bedrock-model
oislen Feb 15, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
706 changes: 706 additions & 0 deletions data/ref/llama_email_domains.csv

Large diffs are not rendered by default.

10,787 changes: 5,231 additions & 5,556 deletions data/ref/llama_first_names.csv

Large diffs are not rendered by default.

9,947 changes: 5,160 additions & 4,787 deletions data/ref/llama_last_names.csv

Large diffs are not rendered by default.

Binary file modified data/unittest/transaction_data.parquet
Binary file not shown.
Binary file modified data/unittest/user_data.parquet
Binary file not shown.
3 changes: 1 addition & 2 deletions generator/app/gen_random_telecom_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ def gen_random_telecom_data(
fpath_first_names=cons.fpath_llama_first_names,
fpath_last_names=cons.fpath_llama_last_names,
fpath_countries_europe=cons.fpath_countries_europe,
fpath_email_domain=cons.fpath_email_domain,
fpath_bedrock_email_domain=cons.fpath_llama_email_domains
fpath_email_domain=cons.fpath_llama_email_domains,
)

# generate random entity counts for each user
Expand Down
154 changes: 111 additions & 43 deletions generator/batch/gen_bedrock_data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# python generator/batch/gen_bedrock_data.py
# uv run python batch/gen_bedrock_data.py --data_point first_names --run_bedrock
# uv run python batch/gen_bedrock_data.py --data_point last_names --run_bedrock
# uv run python batch/gen_bedrock_data.py --data_point email_domains --run_bedrock

import os
import json
import boto3
from botocore.config import Config
Expand All @@ -9,6 +12,7 @@
import unidecode
import pandas as pd
import numpy as np
import argparse

sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator")

Expand Down Expand Up @@ -49,23 +53,53 @@
"""

system_email_prompt = """
# Task

You are an email domain name generator for different countries in Europe.
Your task is to generate an arbitrary N number of distinct and varied email domains, for a given European country.

# Requirements

- Generate typical and popular email domains.
- Do not repeat any email domains more than once.
- Each individual email domain must be unique.
- You should return the email domains using a valid JSON object tagged as <answer></answer>.
- The valid JSON object should be of the following structures; `["email domain 1","email domain 2",...,"email domain N"]`.

# Examples

- Generate 2 popular email domain names for people from the country "Germany" -> <answer>["gmail.com","web.de"]</answer>
- Generate 4 popular email domain names for people from the country "United Kingdom" -> <answer>["gmail.com","outlook.com","yahoo.co.uk","btinternet.com"]</answer>
- Generate 3 popular email domain names for people from the country "France" -> <answer>["orange.fr","laposte.net","free.fr"]</answer>
- Generate 5 popular email domain names for people from the country "Spain" -> <answer>["gmail.com","hotmail.es","yahoo.es","outlook.es","telefonica.net"]</answer>
- Generate 6 popular email domain names for people from the country "Sweden" -> <answer>["gmail.com","hotmail.com","outlook.com","telia.com","spray.se","comhem.se"]</answer>
"""

system_name = [{"text":system_name_prompt,}]
system_email = [{"text":system_email_prompt,}]

first_name_prompt = 'Generate {n_data_points} first names for people from the country "{country}"'
surname_prompt = 'Generate {n_data_points} last names for people from the country "{country}"'
email_domain_prompt = 'Generate {n_data_points} popular email domains names for people from the country "{country}"'

bedrock_config = {
"inferenceConfig":{
"maxTokens":8192,
"temperature":0.5,
"topP":0.5,
},
"system":[
{
"text":system_name_prompt
last_name_prompt = 'Generate {n_data_points} last names for people from the country "{country}"'
email_domain_prompt = 'Generate {n_data_points} popular email domain names for people from the country "{country}"'

data_point_prompt_dict = {
"first_names":[first_name_prompt, system_name],
"last_names":[last_name_prompt, system_name],
"email_domains":[email_domain_prompt, system_email]
}

boto3_config = Config(
connect_timeout=60,
read_timeout=300,
retries={
"max_attempts":2,
"mode": "adaptive"
}
]
)

inferenceConfig = {
"maxTokens":8192,
"temperature":0.1,
}

def invoke_bedrock(
Expand All @@ -75,8 +109,6 @@ def invoke_bedrock(
n_data_points:int,
country:str,
countrieseurope:pd.DataFrame,
prompt:str,
system_prompt:str,
country_fpath:str,
) -> tuple[pd.DataFrame, pd.DataFrame]:
"""
Expand Down Expand Up @@ -121,64 +153,69 @@ def invoke_bedrock(
"""
logging.info("Calling Bedrock ...")
# call bedrock model
formatted_prompt = prompt.format(n_data_points=n_data_points, country=country)
formatted_prompt = data_point_prompt_dict[data_point][0].format(n_data_points=n_data_points, country=country)
system = data_point_prompt_dict[data_point][1]
messages = [{"role":"user", "content":[{"text":formatted_prompt}]}]
logging.info(messages)
#model_response = model.prompt(model_id=model_id, user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048)
model_response = model.converse(modelId=model_id, messages=messages, system=bedrock_config['system'], inference_config=bedrock_config['inferenceConfig'])
model_response = model.converse(modelId=model_id, messages=messages, system=system, inference_config=inferenceConfig)
# split out answer
text = model_response.split("<answer>")[1].split("</answer>")[0]
text = model_response['output']['message']['content'][0]['text'].split("<answer>")[1].split("</answer>")[0]
# parse json
try:
gen_data_list = json.loads(text)
except json.JSONDecodeError as e:
raise Exception(f"Error parsing JSON: {e}")
logging.info("Processing results ...")
# generate pandas dataframe
gen_dataframe = pd.Series(gen_data_list, name=data_point).drop_duplicates().to_frame()
gen_dataframe = pd.Series(gen_data_list, name=data_point).drop_duplicates().to_frame().reset_index().rename(columns={'index':'rank'})
gen_dataframe['country'] = country
gen_country_dataframe = pd.merge(
left=gen_dataframe,
right=countrieseurope.rename(columns={'name':'country'}),
left_on='country',
right_on='name',
how='inner'
)
gen_country_dataframe = pd.merge(left=gen_dataframe, right=countrieseurope.rename(columns={'name':'country'}), on='country', how='inner')
# standardise names formatting
standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.strip())) if pd.isna(x) else x
standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.strip().split())) if not pd.isna(x) else x
gen_country_dataframe[data_point] = gen_country_dataframe[data_point].apply(lambda x: standardise_text_lambda(x))
# check against previous iterations
tmp_gen_country_dataframe = pd.DataFrame()
if os.path.exists(country_fpath):
tmp_gen_country_dataframe = pd.read_csv(country_fpath, encoding="utf-8")
# concatenate results
gen_country_dataframe = pd.concat(objs=[gen_country_dataframe, tmp_gen_country_dataframe], axis=0, ignore_index=True)
# deduplicate data
groupby_cols = [data_point,"country","ISO numeric"]
agg_dict = {"rank":"mean"}
gen_country_dataframe = gen_country_dataframe.dropna().groupby(groupby_cols, as_index=False).agg(agg_dict).sort_values(by=groupby_cols)
logging.info(f"gen_country_dataframe.shape: {gen_country_dataframe.shape}")
# save generated data
gen_country_dataframe.to_csv(country_fpath, index=False, encoding="latin1")
gen_country_dataframe.to_csv(country_fpath, index=False, encoding="utf-8")
logging.info(f"Wrote {country_fpath} ...")
return gen_country_dataframe

def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False):
"""
Docstring for main
"""
logging.info(f"data_point:{data_point} ...")
# load countries, first_names and surnames files
countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric'])
countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric', 'population'])
n_countries = countrieseurope.shape[0]
# set lists to collect generated data with
gen_country_dataframe_list, error_countries = [], []
# set countries list
#countries_list = countrieseurope['name'].to_list()
countries_list = ['Cyprus']
countries_list = countrieseurope['name'].to_list()
#countries_list = ['Ireland']
# iterate over countries list
for country in countries_list:
logging.info(f"{country} ...")
country_fpath=fpath_dict['country_fpath'].format(country)
logging.info(f"country:{country} ...")
country_fpath=fpath_dict['country_fpath'].format(country=country)
try:
if run_bedrock:
# call bedrock model and generate user names data
country_filter = (countrieseurope["name"] == country)
country_population = countrieseurope.loc[country_filter, "population"].iloc[0]
# set n data points for ai generator depending on type
if data_point in ("first_names", "last_names"):
n_data_points = int(np.log(country_population)**1.5)
n_data_points = int(np.log(country_population)**1.75)
elif data_point == "email_domains":
n_data_points = 5
n_data_points = 15
else:
raise ValueError(f"Invalid parameter data_point value {data_point}")
# invoke bedrock and generate data points
Expand All @@ -196,6 +233,7 @@ def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False):
time.sleep(20)
else:
tmp_gen_country_data = pd.read_csv(country_fpath, encoding="latin1")
logging.info(f"tmp_gen_country_data.shape:{tmp_gen_country_data.shape}")
# append to user country data
gen_country_dataframe_list.append(tmp_gen_country_data)
except Exception as e:
Expand All @@ -204,25 +242,50 @@ def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False):
# log if any countries failed to generate data
if len(error_countries) > 0:
logging.info(f"Failed to generated data for countries: {error_countries}")
logging.info(f"Concatenating Country files ...")
# concatenate user country data together and deduplicate across first_names and countries
output_gen_country_dataframe = pd.concat(gen_country_dataframe_list, axis=0, ignore_index=True)
# invert the index ranks and then convert to probability weightings within countries
invert_rank_lambda = lambda group: group['rank'].max() - group['rank']
conv_proba_lambda = lambda group, power: (group['inverse_rank'].pow(power) + 1) / (group['inverse_rank'].pow(power).sum() + 1)
output_gen_country_dataframe['inverse_rank'] = output_gen_country_dataframe.groupby(by=["country"], as_index=False, group_keys=False).apply(invert_rank_lambda, include_groups=False)
if data_point in ("first_names", "last_names"):
output_gen_country_dataframe['probability'] = output_gen_country_dataframe.groupby(by=["country"], as_index=False, group_keys=False).apply(conv_proba_lambda, power=1, include_groups=False)
elif data_point == "email_domains":
output_gen_country_dataframe['probability'] = output_gen_country_dataframe.groupby(by=["country"], as_index=False, group_keys=False).apply(conv_proba_lambda, power=1.5, include_groups=False)
else:
raise ValueError(f"Invalid parameter data_point value {data_point}")
# sort and deduplicate output data
sort_dedup_cols = ["country",data_point]
output_gen_country_dataframe = output_gen_country_dataframe.drop_duplicates(subset=sort_dedup_cols).sort_values(by=sort_dedup_cols)
# write data to disk
if output_gen_country_dataframe['country'].nunique() == n_countries:
logging.info(f"Writing reference file: {fpath_dict['fpath']}")
output_gen_country_dataframe.to_csv(fpath_dict["fpath"], index=False, encoding="utf-8")
logging.info(f"output_gen_country_dataframe.shape: {output_gen_country_dataframe.shape}")
output_gen_country_dataframe.to_csv(fpath_dict["fpath"], index=False, encoding="latin1")
else:
logging.info("WARNING Insufficient first name data generated.")
logging.info(f"WARNING Insufficient {data_point} data generated.")

lgr = logging.getLogger()
lgr.setLevel(logging.INFO)

if __name__ == "__main__":
# define argument parser object
parser = argparse.ArgumentParser(description="Execute Random TeleCom Data Programme.")
# add input arguments
parser.add_argument("--data_point", action="store", dest="data_point", type=str, choices=list(cons.llama_data_point_fpaths.keys()), help="String, the data point to generate Bedrock data for.",)
parser.add_argument("--run_bedrock", action=argparse.BooleanOptionalAction, dest="run_bedrock", type=bool, default=False, help="Boolean, whether to generate data by calling Bedrock",)
# extract input arguments
args = parser.parse_args()
data_point = args.data_point
run_bedrock = args.run_bedrock
logging.info(f"data_point: {data_point}")
logging.info(f"run_bedrock: {run_bedrock}")
# set aws region
aws_region = "us-east-1"
model_id="us.meta.llama3-1-70b-instruct-v1:0"
model_id="us.anthropic.claude-sonnet-4-5-20250929-v1:0"
logging.info(f"aws_region: {aws_region}")
logging.info(f"model_id: {model_id}")
# load aws config
with open(cons.fpath_aws_session_token, "r") as j:
aws_config = json.loads(j.read())
Expand All @@ -236,11 +299,16 @@ def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False):
bedrock_runtime = session.client(
service_name="bedrock-runtime",
region_name=aws_region,
config=Config(retries={"max_attempts":1, "mode": "adaptive"})
config=boto3_config
)
# create bedrock instance
bedrock = Bedrock(bedrock_runtime=bedrock_runtime)
# execute main programme
for data_point, fpath_dict in cons.llama_data_point_fpaths.items():
main(bedrock=bedrock, model_id=model_id, data_point=data_point, fpath_dict=fpath_dict, run_bedrock=True)
main(
bedrock=bedrock,
model_id=model_id,
data_point=data_point,
fpath_dict=cons.llama_data_point_fpaths[data_point],
run_bedrock=run_bedrock
)

2 changes: 1 addition & 1 deletion generator/cons.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
llama_data_point_fpaths = {
"first_names":{"fpath":fpath_llama_first_names, "country_fpath":fpath_temp_llama_first_names},
"last_names":{"fpath":fpath_llama_last_names, "country_fpath":fpath_temp_llama_last_names},
"email_domain":{"fpath":fpath_llama_email_domains, "country_fpath":fpath_temp_llama_email_domains}
"email_domains":{"fpath":fpath_llama_email_domains, "country_fpath":fpath_temp_llama_email_domains}
}

# set url links to files available online
Expand Down
3 changes: 3 additions & 0 deletions generator/exeBedrockData.cmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
call uv run python batch\gen_bedrock_data.py --data_point first_names --run_bedrock
call uv run python batch\gen_bedrock_data.py --data_point last_names --run_bedrock
call uv run python batch\gen_bedrock_data.py --data_point email_domains --run_bedrock
1 change: 0 additions & 1 deletion generator/exeGenNames.cmd

This file was deleted.

Loading