From 622491c9ae25e55540f1c63c146b011055a49cfe Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Thu, 31 Oct 2024 12:49:21 +0000 Subject: [PATCH] Filter out empty pages when using page wise chunking --- adi_function_app/adi_2_ai_search.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index 7e8ff27..50cee7d 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -92,7 +92,8 @@ def update_figure_description( """ # Define the new string to replace the old content - new_string = f'' + new_string = f"""""" # Calculate the end index of the content to be replaced end_index = offset + length @@ -537,9 +538,6 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> result, operation_id = await analyse_document(temp_file_path) except ValueError as inner_e: logging.error(inner_e) - logging.error( - f"Failed to analyze the document with Azure Document Intelligence: {e}" - ) logging.error( "Failed to analyse %s with Azure Document Intelligence.", blob ) @@ -607,7 +605,14 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> ) } for future in concurrent.futures.as_completed(futures): - cleaned_result.append(future.result()) + result = future.result() + if len(result["content"]) == 0: + logging.error( + "No content found in the cleaned result for slide %s.", + result["pageNumber"], + ) + else: + cleaned_result.append(result) else: markdown_content = result.content