From ccf2ea8a7c737102ca0fa5b6b626a92ddeafe26c Mon Sep 17 00:00:00 2001 From: Poulami Das Date: Mon, 23 Sep 2024 13:10:32 +0530 Subject: [PATCH 01/10] changes for html tags --- adi_function_app/pre_embedding_cleaner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/adi_function_app/pre_embedding_cleaner.py b/adi_function_app/pre_embedding_cleaner.py index 9e3d97b..addb344 100644 --- a/adi_function_app/pre_embedding_cleaner.py +++ b/adi_function_app/pre_embedding_cleaner.py @@ -78,6 +78,9 @@ def clean_text(src_text: str) -> str: } cleaned_text = remove_markdown_tags(src_text, tag_patterns) + # remove html tags + cleaned_text = re.sub(r"<.*?>", "", cleaned_text) + # remove line breaks cleaned_text = re.sub(r"\n", "", cleaned_text) From 1852c3b339d50c09ccb8c5b19ef201a9baf0b4dd Mon Sep 17 00:00:00 2001 From: Poulami Das Date: Mon, 23 Sep 2024 14:53:36 +0530 Subject: [PATCH 02/10] update --- adi_function_app/pre_embedding_cleaner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/adi_function_app/pre_embedding_cleaner.py b/adi_function_app/pre_embedding_cleaner.py index addb344..a073654 100644 --- a/adi_function_app/pre_embedding_cleaner.py +++ b/adi_function_app/pre_embedding_cleaner.py @@ -75,6 +75,7 @@ def clean_text(src_text: str) -> str: "figure": r"
(.*?)
", "figures": r"\(figures/\d+\)(.*?)\(figures/\d+\)", "figcaption": r"
(.*?)
", + "figureidandcontent":r'' } cleaned_text = remove_markdown_tags(src_text, tag_patterns) From 0cf335f53f2e6f16c2e36b5f4e53b907e5317b47 Mon Sep 17 00:00:00 2001 From: Poulami Das Date: Mon, 23 Sep 2024 14:56:39 +0530 Subject: [PATCH 03/10] update --- adi_function_app/pre_embedding_cleaner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/adi_function_app/pre_embedding_cleaner.py b/adi_function_app/pre_embedding_cleaner.py index a073654..5777de4 100644 --- a/adi_function_app/pre_embedding_cleaner.py +++ b/adi_function_app/pre_embedding_cleaner.py @@ -82,8 +82,11 @@ def clean_text(src_text: str) -> str: # remove html tags cleaned_text = re.sub(r"<.*?>", "", cleaned_text) - # remove line breaks - cleaned_text = re.sub(r"\n", "", cleaned_text) + # Replace newline characters with spaces + cleaned_text = re.sub(r"\n", " ", cleaned_text) + + # Replace multiple whitespace characters with a single space + cleaned_text = re.sub(r"\s+", " ", cleaned_text) # remove stopwords tokens = word_tokenize(cleaned_text, "english") From 9285f5fce1d811668ed24eb67d328abc9f685159 Mon Sep 17 00:00:00 2001 From: Poulami Das Date: Mon, 23 Sep 2024 15:05:50 +0530 Subject: [PATCH 04/10] updated --- adi_function_app/pre_embedding_cleaner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adi_function_app/pre_embedding_cleaner.py b/adi_function_app/pre_embedding_cleaner.py index 5777de4..f658815 100644 --- a/adi_function_app/pre_embedding_cleaner.py +++ b/adi_function_app/pre_embedding_cleaner.py @@ -75,7 +75,7 @@ def clean_text(src_text: str) -> str: "figure": r"
(.*?)
", "figures": r"\(figures/\d+\)(.*?)\(figures/\d+\)", "figcaption": r"
(.*?)
", - "figureidandcontent":r'' + "figureidandcontent": r'' } cleaned_text = remove_markdown_tags(src_text, tag_patterns) From 1acd2edd16a7f3ffc88d97365357cc7363afa8dd Mon Sep 17 00:00:00 2001 From: Poulami Das Date: Mon, 23 Sep 2024 17:09:23 +0530 Subject: [PATCH 05/10] update --- adi_function_app/pre_embedding_cleaner.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/adi_function_app/pre_embedding_cleaner.py b/adi_function_app/pre_embedding_cleaner.py index f658815..81386d3 100644 --- a/adi_function_app/pre_embedding_cleaner.py +++ b/adi_function_app/pre_embedding_cleaner.py @@ -71,11 +71,10 @@ def clean_text(src_text: str) -> str: try: # Define specific patterns for each tag tag_patterns = { - "figurecontent": r"", + "figurecontent": r"", "figure": r"
(.*?)
", "figures": r"\(figures/\d+\)(.*?)\(figures/\d+\)", - "figcaption": r"
(.*?)
", - "figureidandcontent": r'' + "figcaption": r"
(.*?)
" } cleaned_text = remove_markdown_tags(src_text, tag_patterns) From 36713de8d089aa4d2f052cc57e4aace1fa9d45a7 Mon Sep 17 00:00:00 2001 From: Poulami Das Date: Mon, 23 Sep 2024 17:20:16 +0530 Subject: [PATCH 06/10] update --- adi_function_app/.env | 12 ------------ adi_function_app/local.settings.json | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 12 deletions(-) delete mode 100644 adi_function_app/.env create mode 100644 adi_function_app/local.settings.json diff --git a/adi_function_app/.env b/adi_function_app/.env deleted file mode 100644 index eb0ec41..0000000 --- a/adi_function_app/.env +++ /dev/null @@ -1,12 +0,0 @@ -FunctionApp__ClientId= -IdentityType= # system_assigned or user_assigned or key -OpenAI__ApiKey= -OpenAI__Endpoint= -OpenAI__MultiModalDeployment= -OpenAI__ApiVersion= -AIService__DocumentIntelligence__Endpoint= -AIService__DocumentIntelligence__Key= -AIService__Language__Endpoint= -AIService__Language__Key= -StorageAccount__Endpoint= -StorageAccount__ConnectionString= diff --git a/adi_function_app/local.settings.json b/adi_function_app/local.settings.json new file mode 100644 index 0000000..6d2c10e --- /dev/null +++ b/adi_function_app/local.settings.json @@ -0,0 +1,17 @@ +{ + "IsEncrypted": false, + "Values": { + "FunctionApp__ClientId": "", + "IdentityType": " # system_assigned or user_assigned or key", + "OpenAI__ApiKey": "", + "OpenAI__Endpoint": "", + "OpenAI__MultiModalDeployment": "", + "OpenAI__ApiVersion": "", + "AIService__DocumentIntelligence__Endpoint": "", + "AIService__DocumentIntelligence__Key": "", + "AIService__Language__Endpoint": "", + "AIService__Language__Key": "", + "StorageAccount__Endpoint": "", + "StorageAccount__ConnectionString": "" + } +} \ No newline at end of file From ccbd1c6ac354afc2eeb7a3d5182ad074215596f6 Mon Sep 17 00:00:00 2001 From: Poulami Das Date: Mon, 23 Sep 2024 17:20:56 +0530 Subject: [PATCH 07/10] update --- adi_function_app/.env | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 adi_function_app/.env diff --git a/adi_function_app/.env b/adi_function_app/.env new file mode 100644 index 0000000..429d7c2 --- /dev/null +++ b/adi_function_app/.env @@ -0,0 +1,12 @@ +FunctionApp__ClientId= +IdentityType= # system_assigned or user_assigned or key +OpenAI__ApiKey= +OpenAI__Endpoint= +OpenAI__MultiModalDeployment= +OpenAI__ApiVersion= +AIService__DocumentIntelligence__Endpoint= +AIService__DocumentIntelligence__Key= +AIService__Language__Endpoint= +AIService__Language__Key= +StorageAccount__Endpoint= +StorageAccount__ConnectionString= \ No newline at end of file From cf889a1cfc3cb7ee69e795e66a08a0d17c250f01 Mon Sep 17 00:00:00 2001 From: Poulami Das Date: Mon, 23 Sep 2024 17:46:02 +0530 Subject: [PATCH 08/10] update --- adi_function_app/.env | 2 +- adi_function_app/local.settings.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/adi_function_app/.env b/adi_function_app/.env index 429d7c2..eb0ec41 100644 --- a/adi_function_app/.env +++ b/adi_function_app/.env @@ -9,4 +9,4 @@ AIService__DocumentIntelligence__Key= AIService__Language__Key= StorageAccount__Endpoint= -StorageAccount__ConnectionString= \ No newline at end of file +StorageAccount__ConnectionString= diff --git a/adi_function_app/local.settings.json b/adi_function_app/local.settings.json index 6d2c10e..253f703 100644 --- a/adi_function_app/local.settings.json +++ b/adi_function_app/local.settings.json @@ -14,4 +14,4 @@ "StorageAccount__Endpoint": "", "StorageAccount__ConnectionString": "" } -} \ No newline at end of file +} From 15f639774f7ff8a2cfb9f87ffecc6fc4ca751a9e Mon Sep 17 00:00:00 2001 From: Poulami Das Date: Mon, 23 Sep 2024 17:49:27 +0530 Subject: [PATCH 09/10] Apply pre-commit hook changes --- adi_function_app/local.settings.json | 30 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/adi_function_app/local.settings.json b/adi_function_app/local.settings.json index 253f703..7bac6ec 100644 --- a/adi_function_app/local.settings.json +++ b/adi_function_app/local.settings.json @@ -1,17 +1,17 @@ { - "IsEncrypted": false, - "Values": { - "FunctionApp__ClientId": "", - "IdentityType": " # system_assigned or user_assigned or key", - "OpenAI__ApiKey": "", - "OpenAI__Endpoint": "", - "OpenAI__MultiModalDeployment": "", - "OpenAI__ApiVersion": "", - "AIService__DocumentIntelligence__Endpoint": "", - "AIService__DocumentIntelligence__Key": "", - "AIService__Language__Endpoint": "", - "AIService__Language__Key": "", - "StorageAccount__Endpoint": "", - "StorageAccount__ConnectionString": "" - } + "IsEncrypted": false, + "Values": { + "AIService__DocumentIntelligence__Endpoint": "", + "AIService__DocumentIntelligence__Key": "", + "AIService__Language__Endpoint": "", + "AIService__Language__Key": "", + "FunctionApp__ClientId": "", + "IdentityType": " # system_assigned or user_assigned or key", + "OpenAI__ApiKey": "", + "OpenAI__ApiVersion": "", + "OpenAI__Endpoint": "", + "OpenAI__MultiModalDeployment": "", + "StorageAccount__ConnectionString": "", + "StorageAccount__Endpoint": "" + } } From 54a7eb8cade1d3c3e78d8c31321198d53cce3d30 Mon Sep 17 00:00:00 2001 From: Poulami Das Date: Mon, 23 Sep 2024 17:53:04 +0530 Subject: [PATCH 10/10] Apply black hook changes --- adi_function_app/pre_embedding_cleaner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adi_function_app/pre_embedding_cleaner.py b/adi_function_app/pre_embedding_cleaner.py index 81386d3..f6f0a87 100644 --- a/adi_function_app/pre_embedding_cleaner.py +++ b/adi_function_app/pre_embedding_cleaner.py @@ -74,7 +74,7 @@ def clean_text(src_text: str) -> str: "figurecontent": r"", "figure": r"
(.*?)
", "figures": r"\(figures/\d+\)(.*?)\(figures/\d+\)", - "figcaption": r"
(.*?)
" + "figcaption": r"
(.*?)
", } cleaned_text = remove_markdown_tags(src_text, tag_patterns)