From 7029a1843ff3a9713c1011f260e5e5db1db1cf8a Mon Sep 17 00:00:00 2001 From: priyal1508 <54278892+priyal1508@users.noreply.github.com> Date: Thu, 5 Sep 2024 10:52:29 +0530 Subject: [PATCH 01/33] changes for adi based skillset --- aisearch-skillset/ai_search.py | 690 ++++++++++++++++++ aisearch-skillset/deploy.py | 80 ++ aisearch-skillset/environment.py | 192 +++++ aisearch-skillset/inquiry_document.py | 320 ++++++++ function_apps/common/ai_search.py | 127 ++++ function_apps/indexer/adi_2_aisearch.py | 460 ++++++++++++ function_apps/indexer/function_app.py | 296 ++++++++ .../indexer/key_phrase_extraction.py | 112 +++ .../indexer/pre_embedding_cleaner.py | 144 ++++ function_apps/indexer/requirements.txt | 26 + 10 files changed, 2447 insertions(+) create mode 100644 aisearch-skillset/ai_search.py create mode 100644 aisearch-skillset/deploy.py create mode 100644 aisearch-skillset/environment.py create mode 100644 aisearch-skillset/inquiry_document.py create mode 100644 function_apps/common/ai_search.py create mode 100644 function_apps/indexer/adi_2_aisearch.py create mode 100644 function_apps/indexer/function_app.py create mode 100644 function_apps/indexer/key_phrase_extraction.py create mode 100644 function_apps/indexer/pre_embedding_cleaner.py create mode 100644 function_apps/indexer/requirements.txt diff --git a/aisearch-skillset/ai_search.py b/aisearch-skillset/ai_search.py new file mode 100644 index 0000000..7573055 --- /dev/null +++ b/aisearch-skillset/ai_search.py @@ -0,0 +1,690 @@ +from abc import ABC, abstractmethod +from azure.search.documents.indexes.models import ( + SearchIndex, + SearchableField, + VectorSearch, + VectorSearchProfile, + HnswAlgorithmConfiguration, + SemanticSearch, + NativeBlobSoftDeleteDeletionDetectionPolicy, + HighWaterMarkChangeDetectionPolicy, + WebApiSkill, + CustomVectorizer, + CustomWebApiParameters, + SearchIndexer, + SearchIndexerSkillset, + SearchIndexerDataContainer, + SearchIndexerDataSourceConnection, + 
def __init__(
    self,
    endpoint: str,
    credential,
    suffix: str | None = None,
    rebuild: bool | None = False,
):
    """Set up naming state and the two Azure AI Search management clients.

    Args:
        endpoint (str): The search service endpoint.
        credential: Credential object handed to both search clients.
        suffix (str | None): When given, every resource name gets the
            ``-<suffix>-test`` suffix and the instance is flagged as a test
            deployment.
        rebuild (bool | None): Whether the index should be deleted before it
            is (re)created; ``None`` is treated as ``False``.
    """
    # Left unset here; the resource-name properties read it later.
    self.indexer_type = None

    self.rebuild = False if rebuild is None else rebuild

    # A suffix marks a test deployment, which also lowers skill parallelism.
    self.test = suffix is not None
    self.suffix = f"-{suffix}-test" if self.test else ""

    self._search_indexer_client = SearchIndexerClient(endpoint, credential)
    self._search_index_client = SearchIndexClient(endpoint, credential)
def get_data_source(self) -> SearchIndexerDataSourceConnection:
    """Build the blob data source connection for this indexer.

    Change detection uses the ``metadata_storage_last_modified`` high-water
    mark. Native blob soft-delete detection is enabled for every indexer type
    except the business glossary.

    Returns:
        SearchIndexerDataSourceConnection: The data source definition.
    """
    is_glossary = self.indexer_type == IndexerType.BUSINESS_GLOSSARY
    deletion_policy = (
        None if is_glossary else NativeBlobSoftDeleteDeletionDetectionPolicy()
    )
    change_policy = HighWaterMarkChangeDetectionPolicy(
        high_water_mark_column_name="metadata_storage_last_modified"
    )
    blob_container = SearchIndexerDataContainer(
        name=get_blob_container_name(self.indexer_type)
    )

    return SearchIndexerDataSourceConnection(
        name=self.data_source_name,
        type=SearchIndexerDataSourceType.AZURE_BLOB,
        connection_string=get_fq_blob_connection_string(),
        container=blob_container,
        data_change_detection_policy=change_policy,
        data_deletion_detection_policy=deletion_policy,
        identity=self.get_user_assigned_managed_identity(),
    )
def get_compass_vector_custom_skill(
    self, context, source, target_name="vector"
) -> WebApiSkill:
    """Build the Web API skill that requests embeddings from the compass function.

    Args:
        context (str): The enrichment context the skill runs in.
        source (str): The enrichment path supplying the ``text`` input.
        target_name (str): The name under which the ``vector`` output is stored.

    Returns:
        WebApiSkill: The configured embedding skill.
    """
    # Keep the number of documents in flight low so the compass API does not
    # time out; test deployments are throttled further.
    batch_size, degree_of_parallelism = (2, 2) if self.test else (4, 8)

    text_input = InputFieldMappingEntry(name="text", source=source)
    vector_output = OutputFieldMappingEntry(name="vector", target_name=target_name)

    return WebApiSkill(
        name="Compass Connector API",
        description="Skill to generate embeddings via compass API connector",
        context=context,
        uri=get_custom_skill_function_url("compass"),
        timeout="PT230S",
        batch_size=batch_size,
        degree_of_parallelism=degree_of_parallelism,
        http_method="POST",
        inputs=[text_input],
        outputs=[vector_output],
        auth_resource_id=get_function_app_authresourceid(),
        auth_identity=self.get_user_assigned_managed_identity(),
    )
def get_text_split_skill(self, context, source) -> SplitSkill:
    """Build the built-in split skill that chunks text before embedding.

    Text is split in ``pages`` mode with a maximum page length of 2000 and an
    overlap of 500; the resulting ``textItems`` are exposed as ``pages``.

    Args:
        context (str): The enrichment context the skill runs in.
        source (str): The enrichment path supplying the ``text`` input.

    Returns:
        SplitSkill: The configured text split skill.
    """
    split_inputs = [InputFieldMappingEntry(name="text", source=source)]
    split_outputs = [OutputFieldMappingEntry(name="textItems", target_name="pages")]

    return SplitSkill(
        name="Text Split Skill",
        description="Skill to split the text before sending to embedding",
        context=context,
        text_split_mode="pages",
        maximum_page_length=2000,
        page_overlap_length=500,
        inputs=split_inputs,
        outputs=split_outputs,
    )
def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill:
    """Build the Web API skill that extracts key phrases from a chunk.

    Args:
        context (str): The enrichment context the skill runs in.
        source (str): The enrichment path supplying the ``text`` input.

    Returns:
        WebApiSkill: The configured key phrase extraction skill; its
        ``keyPhrases`` output is stored as ``keywords``.
    """
    # Test deployments run with a smaller batch and less parallelism.
    batch_size, degree_of_parallelism = (4, 4) if self.test else (16, 16)

    return WebApiSkill(
        name="Key phrase extraction API",
        description="Skill to extract keyphrases",
        context=context,
        uri=get_custom_skill_function_url("keyphraseextraction"),
        timeout="PT230S",
        batch_size=batch_size,
        degree_of_parallelism=degree_of_parallelism,
        http_method="POST",
        inputs=[InputFieldMappingEntry(name="text", source=source)],
        outputs=[OutputFieldMappingEntry(name="keyPhrases", target_name="keywords")],
        auth_resource_id=get_function_app_authresourceid(),
        auth_identity=self.get_user_assigned_managed_identity(),
    )
def get_ocr_skill(self, context, source) -> WebApiSkill:
    """Build the Web API skill that extracts text from images via the OCR function.

    The skill is a custom ``WebApiSkill`` that calls the function app; it is
    not the built-in ``OcrSkill``, so the return annotation reflects that.

    Args:
        context (str): The enrichment context the skill runs in.
        source (str): The enrichment path supplying the ``image`` input.

    Returns:
        WebApiSkill: The configured OCR skill; its ``text`` output is stored
        as ``text``.
    """
    # Test and non-test deployments used identical values, so no branch on
    # self.test is needed here.
    batch_size = 2
    degree_of_parallelism = 2

    ocr_skill_inputs = [
        InputFieldMappingEntry(name="image", source=source),
    ]
    ocr_skill_outputs = [OutputFieldMappingEntry(name="text", target_name="text")]

    ocr_skill = WebApiSkill(
        name="ocr API",
        description="Skill to extract text from images",
        context=context,
        uri=get_custom_skill_function_url("ocr"),
        timeout="PT230S",
        batch_size=batch_size,
        degree_of_parallelism=degree_of_parallelism,
        http_method="POST",
        inputs=ocr_skill_inputs,
        outputs=ocr_skill_outputs,
        auth_resource_id=get_function_app_authresourceid(),
        auth_identity=self.get_user_assigned_managed_identity(),
    )

    return ocr_skill
def get_conditional_skill(self, context, source) -> ConditionalSkill:
    """Build the conditional skill that picks between two enrichment values.

    Args:
        context (str): The enrichment context the skill runs in.
        source (array): Three enrichment paths, in order: the condition, the
            value used when it is true, the value used when it is false.

    Returns:
        ConditionalSkill: The configured skill; its ``output`` is stored as
        ``updated_content``.
    """
    condition_input = InputFieldMappingEntry(name="condition", source=source[0])
    when_true_input = InputFieldMappingEntry(name="whenTrue", source=source[1])
    when_false_input = InputFieldMappingEntry(name="whenFalse", source=source[2])

    selected_output = OutputFieldMappingEntry(
        name="output", target_name="updated_content"
    )

    return ConditionalSkill(
        description="Select between OCR and Document Extraction output",
        context=context,
        inputs=[condition_input, when_true_input, when_false_input],
        outputs=[selected_output],
    )
def deploy_index(self):
    """Create or update the search index, deleting it first on a rebuild.

    The index is assembled from the subclass-provided fields and semantic
    configuration plus the compass vector search configuration.
    """
    fields = self.get_index_fields()
    vector_config = self.get_compass_vector_search()
    semantic_config = self.get_semantic_search()

    index = SearchIndex(
        name=self.index_name,
        fields=fields,
        vector_search=vector_config,
        semantic_search=semantic_config,
    )

    index_client = self._search_index_client
    if self.rebuild:
        # A rebuild drops the existing index before recreating it.
        index_client.delete_index(self.index_name)
    index_client.create_or_update_index(index)

    print(f"{index.name} created")
def deploy_synonym_map(self) -> None:
    """Create an empty synonym map for every name from ``get_synonym_map_names``.

    A failed creation is reported on stdout and does not stop the remaining
    maps from being processed. Nothing is returned.
    """
    for synonym_map_name in self.get_synonym_map_names():
        try:
            synonym_map = SynonymMap(name=synonym_map_name, synonyms="")
            self._search_index_client.create_synonym_map(synonym_map)
        except HttpResponseError as error:
            # The service rejects creation for several reasons (the map
            # already existing being the usual one), so report which map
            # failed and why instead of assuming the cause.
            print(
                f"Unable to deploy synonym map '{synonym_map_name}' "
                f"(it may already exist): {error}"
            )
def _parse_bool_flag(value):
    """Convert a command-line token such as ``True``/``false``/``1`` to a bool.

    argparse's ``type=bool`` cannot be used for this: ``bool("False")`` is
    ``True`` because every non-empty string is truthy.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalised = value.strip().lower()
    if normalised in ("true", "t", "yes", "y", "1"):
        return True
    if normalised in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def main(args):
    """Deploy the AI Search pipeline for the requested indexer type.

    Args:
        args: Parsed command-line arguments providing ``indexer_type``,
            ``rebuild``, ``enable_page_chunking`` and ``suffix``.

    Raises:
        RuntimeError: If no credential could be established.
        ValueError: If ``args.indexer_type`` is not a supported value.
    """
    endpoint = get_search_endpoint()

    # Bound up front so the fallback below can tell whether Key Vault is usable.
    client = None
    try:
        credential = DefaultAzureCredential(
            managed_identity_client_id=get_managed_identity_id()
        )
        # initializing key vault client
        client = SecretClient(vault_url=get_key_vault_url(), credential=credential)
        print("Using managed identity credential")
    except Exception as e:
        print(e)
        if client is None:
            # The admin key lives in Key Vault; without a Key Vault client
            # there is nothing to fall back to. Previously this path failed
            # with a NameError on the unbound ``client``.
            raise RuntimeError(
                "Unable to create a credential and no Key Vault client is "
                "available to retrieve the search admin key"
            ) from e
        credential = AzureKeyCredential(get_search_key(client=client))
        print("Using Azure Key credential")

    if args.indexer_type == "inquiry":
        # Deploy the inquiry index
        index_config = InquiryDocumentAISearch(
            endpoint=endpoint,
            credential=credential,
            suffix=args.suffix,
            rebuild=args.rebuild,
            enable_page_by_chunking=args.enable_page_chunking,
        )
    elif args.indexer_type == "summary":
        # Deploy the summaries index
        # NOTE(review): SummaryDocumentAISearch is not among the imports
        # visible in this file -- confirm it is imported before use.
        index_config = SummaryDocumentAISearch(
            endpoint=endpoint,
            credential=credential,
            suffix=args.suffix,
            rebuild=args.rebuild,
            enable_page_by_chunking=args.enable_page_chunking,
        )
    elif args.indexer_type == "glossary":
        # Deploy business glossary index
        # NOTE(review): BusinessGlossaryAISearch is not among the imports
        # visible in this file -- confirm it is imported before use.
        index_config = BusinessGlossaryAISearch(endpoint, credential)
    else:
        # Fail with a clear message instead of an unbound ``index_config``.
        raise ValueError(
            f"Unsupported indexer_type '{args.indexer_type}'; "
            "expected inquiry, summary or glossary"
        )

    index_config.deploy()

    if args.rebuild:
        index_config.reset_indexer()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process some arguments.")
    parser.add_argument(
        "--indexer_type",
        type=str,
        required=True,
        help="Type of Indexer want to deploy. inquiry/summary/glossary",
    )
    parser.add_argument(
        "--rebuild",
        # Explicit converter: with type=bool, "--rebuild False" parsed as True.
        type=_parse_bool_flag,
        required=False,
        help="Whether want to delete and rebuild the index",
    )
    parser.add_argument(
        "--enable_page_chunking",
        type=_parse_bool_flag,
        required=False,
        help="Whether want to enable chunking by page in adi skill, if no value is passed considered False",
    )
    parser.add_argument(
        "--suffix",
        type=str,
        required=False,
        help="Suffix to be attached to indexer objects",
    )

    args = parser.parse_args()
    main(args)
def get_search_key(client) -> str:
    """Fetch the Azure AI Search admin key from a secret store.

    The secret name is ``<service name>-PrimaryKey``, where the service name
    is read from the ``AIService__AzureSearchOptions__name`` environment
    variable.

    Args:
        client: Object exposing ``get_secret(name)`` whose result has a
            ``value`` attribute.

    Returns:
        str: The value of the retrieved secret.

    Raises:
        ValueError: If the search service name is not configured.
    """
    service_name = os.environ.get("AIService__AzureSearchOptions__name")
    if not service_name:
        # str(None) would silently build the bogus secret name
        # "None-PrimaryKey" and fail later with a misleading lookup error.
        raise ValueError(
            "Environment variable 'AIService__AzureSearchOptions__name' is not set"
        )

    search_service_key_secret_name = f"{service_name}-PrimaryKey"
    retrieved_secret = client.get_secret(search_service_key_secret_name)
    return retrieved_secret.value
azure ai search service admin key + """ + return os.environ.get("AIService__AzureSearchOptions__Key__Secret") + + +def get_search_embedding_model_dimensions(indexer_type: IndexerType) -> str: + """ + This function returns dimensions for embedding model + """ + + normalised_indexer_type = ( + indexer_type.value.replace("-", " ").title().replace(" ", "") + ) + + return os.environ.get( + f"AIService__AzureSearchOptions__{normalised_indexer_type}__EmbeddingDimensions" + ) + +def get_blob_connection_string() -> str: + """ + This function returns azure blob storage connection string + """ + return os.environ.get("StorageAccount__ConnectionString") + +def get_fq_blob_connection_string() -> str: + """ + This function returns azure blob storage connection string + """ + return os.environ.get("StorageAccount__FQEndpoint") + + +def get_blob_container_name(indexer_type: str) -> str: + """ + This function returns azure blob container name + """ + normalised_indexer_type = ( + indexer_type.value.replace("-", " ").title().replace(" ", "") + ) + return os.environ.get(f"StorageAccount__{normalised_indexer_type}__Container") + + +def get_custom_skill_function_url(skill_type: str): + """ + Get the function app url that is hosting the custom skill + """ + url = ( + get_function_app_end_point() + + "/api/function_name?code=" + + get_function_app_key() + ) + if skill_type == "compass": + url = url.replace("function_name", get_function_app_compass_function()) + elif skill_type == "pre_embedding_cleaner": + url = url.replace( + "function_name", get_function_app_pre_embedding_cleaner_function() + ) + elif skill_type == "adi": + url = url.replace("function_name", get_function_app_adi_function()) + elif skill_type == "split": + url = url.replace("function_name", get_function_app_custom_split_function()) + elif skill_type == "keyphraseextraction": + url = url.replace( + "function_name", get_function_app_keyphrase_extractor_function() + ) + elif skill_type == "ocr": + url = 
def get_index_fields(self) -> list[SearchField]:
    """Return the index fields for the inquiry document index.

    ``ChunkId`` is the key field and ``ChunkEmbedding`` is the vector field
    bound to this indexer's vector search profile. ``PageNumber`` is only
    added when chunking by page is enabled.

    Returns:
        list[SearchField]: The index field definitions.
    """
    fields = [
        SimpleField(name="Id", type=SearchFieldDataType.String, filterable=True),
        SearchableField(
            name="Title", type=SearchFieldDataType.String, filterable=True
        ),
        SearchableField(
            name="DealId",
            type=SearchFieldDataType.String,
            sortable=True,
            filterable=True,
            facetable=True,
        ),
        SearchableField(
            name="OracleId",
            type=SearchFieldDataType.String,
            sortable=True,
            filterable=True,
            facetable=True,
        ),
        SearchableField(
            name="ChunkId",
            type=SearchFieldDataType.String,
            key=True,
            analyzer_name="keyword",
        ),
        SearchableField(
            name="Chunk",
            type=SearchFieldDataType.String,
            sortable=False,
            filterable=False,
            facetable=False,
        ),
        SearchableField(
            name="Section",
            type=SearchFieldDataType.String,
            collection=True,
        ),
        SearchField(
            name="ChunkEmbedding",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            vector_search_dimensions=get_search_embedding_model_dimensions(
                self.indexer_type
            ),
            vector_search_profile_name=self.vector_search_profile_name,
        ),
        SearchableField(
            name="Keywords", type=SearchFieldDataType.String, collection=True
        ),
        SearchableField(
            name="SourceUrl",
            type=SearchFieldDataType.String,
            sortable=True,
            filterable=True,
            facetable=True,
        ),
        SearchableField(
            name="AdditionalMetadata",
            type=SearchFieldDataType.String,
            sortable=True,
            filterable=True,
            facetable=True,
        ),
    ]

    if self.enable_page_by_chunking:
        # SearchableField always builds a string (or string collection)
        # field and ignores ``type``, so a numeric field has to be declared
        # with SimpleField for Int64 to take effect.
        fields.append(
            SimpleField(
                name="PageNumber",
                type=SearchFieldDataType.Int64,
                sortable=True,
                filterable=True,
                facetable=True,
            )
        )

    return fields
+ prioritized_fields=SemanticPrioritizedFields( + title_field=SemanticField(field_name="Title"), + content_fields=[SemanticField(field_name="Chunk")], + keywords_fields=[ + SemanticField(field_name="Keywords"), + SemanticField(field_name="Section"), + ], + ), + ) + + semantic_search = SemanticSearch(configurations=[semantic_config]) + + return semantic_search + + def get_skills(self): + """This function returns the skills for inquiry document""" + + adi_skill = self.get_adi_skill(self.enable_page_by_chunking) + + text_split_skill = self.get_text_split_skill( + "/document", "/document/extracted_content/content" + ) + + pre_embedding_cleaner_skill = self.get_pre_embedding_cleaner_skill( + "/document/pages/*", "/document/pages/*", self.enable_page_by_chunking + ) + + key_phrase_extraction_skill = self.get_key_phrase_extraction_skill( + "/document/pages/*", "/document/pages/*/cleaned_chunk" + ) + + embedding_skill = self.get_compass_vector_custom_skill( + "/document/pages/*", "/document/pages/*/cleaned_chunk" + ) + + if self.enable_page_by_chunking: + skills = [ + adi_skill, + pre_embedding_cleaner_skill, + key_phrase_extraction_skill, + embedding_skill, + ] + else: + skills = [ + adi_skill, + text_split_skill, + pre_embedding_cleaner_skill, + key_phrase_extraction_skill, + embedding_skill, + ] + + return skills + + def get_index_projections(self) -> SearchIndexerIndexProjections: + """This function returns the index projections for inquiry document.""" + mappings =[ + InputFieldMappingEntry( + name="Chunk", source="/document/pages/*/chunk" + ), + InputFieldMappingEntry( + name="ChunkEmbedding", + source="/document/pages/*/vector", + ), + InputFieldMappingEntry( + name="Title", + source="/document/Title" + ), + InputFieldMappingEntry( + name="DealId", + source="/document/DealId" + ), + InputFieldMappingEntry( + name="OracleId", + source="/document/OracleId" + ), + InputFieldMappingEntry( + name="SourceUrl", + source="/document/SourceUrl" + ), + InputFieldMappingEntry( 
+ name="Keywords", + source="/document/pages/*/keywords" + ), + InputFieldMappingEntry( + name="AdditionalMetadata", + source="/document/AdditionalMetadata", + ), + InputFieldMappingEntry( + name="Section", + source="/document/pages/*/eachsection" + ) + ] + + if self.enable_page_by_chunking: + mappings.extend( + [ + InputFieldMappingEntry( + name="PageNumber", source="/document/pages/*/page_no" + ) + ] + ) + + index_projections = SearchIndexerIndexProjections( + selectors=[ + SearchIndexerIndexProjectionSelector( + target_index_name=self.index_name, + parent_key_field_name="Id", + source_context="/document/pages/*", + mappings=mappings + ), + ], + parameters=SearchIndexerIndexProjectionsParameters( + projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS + ), + ) + + return index_projections + + def get_indexer(self) -> SearchIndexer: + """This function returns the indexer for inquiry document. + + Returns: + SearchIndexer: The indexer for inquiry document""" + if self.test: + schedule = None + batch_size = 4 + else: + schedule = {"interval": "PT15M"} + batch_size = 16 + + indexer_parameters = IndexingParameters( + batch_size=batch_size, + configuration=IndexingParametersConfiguration( + # image_action=BlobIndexerImageAction.GENERATE_NORMALIZED_IMAGE_PER_PAGE, + data_to_extract=BlobIndexerDataToExtract.ALL_METADATA, + query_timeout=None, + # allow_skillset_to_read_file_data=True, + execution_environment=IndexerExecutionEnvironment.PRIVATE, + # pdf_text_rotation_algorithm=BlobIndexerPDFTextRotationAlgorithm.DETECT_ANGLES, + fail_on_unprocessable_document=False, + fail_on_unsupported_content_type=False, + index_storage_metadata_only_for_oversized_documents=True, + indexed_file_name_extensions=".pdf,.pptx,.docx", + ), + max_failed_items=5, + ) + + indexer = SearchIndexer( + name=self.indexer_name, + description="Indexer to index documents and generate embeddings", + skillset_name=self.skillset_name, + target_index_name=self.index_name, + 
data_source_name=self.data_source_name, + schedule=schedule, + field_mappings=[ + FieldMapping( + source_field_name="metadata_storage_name", target_field_name="Title" + ), + FieldMapping(source_field_name="Deal_ID", target_field_name="DealId"), + FieldMapping( + source_field_name="Oracle_ID", target_field_name="OracleId" + ), + FieldMapping( + source_field_name="SharePointUrl", target_field_name="SourceUrl" + ), + FieldMapping( + source_field_name="Additional_Metadata", + target_field_name="AdditionalMetadata", + ), + ], + parameters=indexer_parameters, + ) + + return indexer diff --git a/function_apps/common/ai_search.py b/function_apps/common/ai_search.py new file mode 100644 index 0000000..1bba829 --- /dev/null +++ b/function_apps/common/ai_search.py @@ -0,0 +1,127 @@ +from azure.search.documents.indexes.aio import SearchIndexerClient, SearchIndexClient +from azure.search.documents.aio import SearchClient +from azure.search.documents.indexes.models import SynonymMap +from azure.identity import DefaultAzureCredential +from azure.core.exceptions import HttpResponseError +import logging +import os +from enum import Enum +from openai import AsyncAzureOpenAI +from azure.search.documents.models import VectorizedQuery + + +class IndexerStatusEnum(Enum): + RETRIGGER = "RETRIGGER" + RUNNING = "RUNNING" + SUCCESS = "SUCCESS" + + +class AISearchHelper: + def __init__(self): + self._client_id = os.environ["FunctionApp__ClientId"] + + self._endpoint = os.environ["AIService__AzureSearchOptions__Endpoint"] + + async def get_index_client(self): + credential = DefaultAzureCredential(managed_identity_client_id=self._client_id) + + return SearchIndexClient(self._endpoint, credential) + + async def get_indexer_client(self): + credential = DefaultAzureCredential(managed_identity_client_id=self._client_id) + + return SearchIndexerClient(self._endpoint, credential) + + async def get_search_client(self, index_name): + credential = 
DefaultAzureCredential(managed_identity_client_id=self._client_id) + + return SearchClient(self._endpoint, index_name, credential) + + async def upload_synonym_map(self, synonym_map_name: str, synonyms: str): + index_client = await self.get_index_client() + async with index_client: + try: + await index_client.delete_synonym_map(synonym_map_name) + except HttpResponseError as e: + logging.error("Unable to delete synonym map %s", e) + + logging.info("Synonyms: %s", synonyms) + synonym_map = SynonymMap(name=synonym_map_name, synonyms=synonyms) + await index_client.create_synonym_map(synonym_map) + + async def get_indexer_status(self, indexer_name): + indexer_client = await self.get_indexer_client() + async with indexer_client: + try: + status = await indexer_client.get_indexer_status(indexer_name) + + last_execution_result = status.last_result + + if last_execution_result.status == "inProgress": + return IndexerStatusEnum.RUNNING, last_execution_result.start_time + elif last_execution_result.status in ["success", "transientFailure"]: + return IndexerStatusEnum.SUCCESS, last_execution_result.start_time + else: + return IndexerStatusEnum.RETRIGGER, last_execution_result.start_time + except HttpResponseError as e: + logging.error("Unable to get indexer status %s", e) + + async def trigger_indexer(self, indexer_name): + indexer_client = await self.get_indexer_client() + async with indexer_client: + try: + await indexer_client.run_indexer(indexer_name) + except HttpResponseError as e: + logging.error("Unable to run indexer %s", e) + + async def search_index( + self, index_name, semantic_config, search_text, deal_id=None + ): + """Search the index using the provided search text.""" + async with AsyncAzureOpenAI( + # This is the default and can be omitted + api_key=os.environ["AIService__Compass_Key"], + azure_endpoint=os.environ["AIService__Compass_Endpoint"], + api_version="2023-03-15-preview", + ) as open_ai_client: + embeddings = await open_ai_client.embeddings.create( + 
model=os.environ["AIService__Compass_Models__Embedding"], + input=search_text, + ) + + # Extract the embedding vector + embedding_vector = embeddings.data[0].embedding + + vector_query = VectorizedQuery( + vector=embedding_vector, + k_nearest_neighbors=5, + fields="ChunkEmbedding", + ) + + if deal_id: + filter_expression = f"DealId eq '{deal_id}'" + else: + filter_expression = None + + logging.info(f"Filter Expression: {filter_expression}") + + search_client = await self.get_search_client(index_name) + async with search_client: + results = await search_client.search( + top=3, + query_type="semantic", + semantic_configuration_name=semantic_config, + search_text=search_text, + select="Title,Chunk", + vector_queries=[vector_query], + filter=filter_expression, + ) + + documents = [ + document + async for result in results.by_page() + async for document in result + ] + + logging.info(f"Documents: {documents}") + return documents diff --git a/function_apps/indexer/adi_2_aisearch.py b/function_apps/indexer/adi_2_aisearch.py new file mode 100644 index 0000000..e0542fb --- /dev/null +++ b/function_apps/indexer/adi_2_aisearch.py @@ -0,0 +1,460 @@ +import base64 +from azure.core.credentials import AzureKeyCredential +from azure.ai.documentintelligence.aio import DocumentIntelligenceClient +from azure.ai.documentintelligence.models import AnalyzeResult, ContentFormat +import os +import re +import asyncio +import fitz +from PIL import Image +import io +import aiohttp +import logging +from common.storage_account import StorageAccountHelper +import concurrent.futures +import json + + +def crop_image_from_pdf_page(pdf_path, page_number, bounding_box): + """ + Crops a region from a given page in a PDF and returns it as an image. + + :param pdf_path: Path to the PDF file. + :param page_number: The page number to crop from (0-indexed). + :param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box. + :return: A PIL Image of the cropped area. 
+ """ + doc = fitz.open(pdf_path) + page = doc.load_page(page_number) + + # Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1). + bbx = [x * 72 for x in bounding_box] + rect = fitz.Rect(bbx) + pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), clip=rect) + + img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + + doc.close() + return img + + +def clean_adi_markdown(markdown_text: str, page_no:int,remove_irrelevant_figures=False): + """Clean Markdown text extracted by the Azure Document Intelligence service. + + Args: + ----- + markdown_text (str): The original Markdown text. + remove_irrelevant_figures (bool): Whether to remove all figures or just irrelevant ones. + + Returns: + -------- + str: The cleaned Markdown text. + """ + + # # Remove the page number comment + # page_number_pattern = r"" + # cleaned_text = re.sub(page_number_pattern, "", markdown_text) + + # # Replace the page header comment with its content + # page_header_pattern = r"" + # cleaned_text = re.sub( + # page_header_pattern, lambda match: match.group(1), cleaned_text + # ) + + # # Replace the page footer comment with its content + # page_footer_pattern = r"" + # cleaned_text = re.sub( + # page_footer_pattern, lambda match: match.group(1), cleaned_text + # ) + output_dict = {} + comment_patterns = r"||" + cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL) + + combined_pattern = r'(.*?)\n===|\n## ?(.*?)\n|\n### ?(.*?)\n' + doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL) + doc_metadata = [match for group in doc_metadata for match in group if match] + + + if remove_irrelevant_figures: + # Remove irrelevant figures + irrelevant_figure_pattern = ( + r"
.*?.*?
\s*" + ) + cleaned_text = re.sub( + irrelevant_figure_pattern, "", cleaned_text, flags=re.DOTALL + ) + + # Replace ':selected:' with a new line + cleaned_text = re.sub(r":(selected|unselected):", "\n", cleaned_text) + output_dict['content'] = cleaned_text + output_dict['section'] = doc_metadata + + # add page number when chunk by page is enabled + if page_no> -1: + output_dict['page_number'] = page_no + + return output_dict + + +def update_figure_description(md_content, img_description, idx): + """ + Updates the figure description in the Markdown content. + + Args: + md_content (str): The original Markdown content. + img_description (str): The new description for the image. + idx (int): The index of the figure. + + Returns: + str: The updated Markdown content with the new figure description. + """ + + # The substring you're looking for + start_substring = f"![](figures/{idx})" + end_substring = "" + new_string = f'' + + new_md_content = md_content + # Find the start and end indices of the part to replace + start_index = md_content.find(start_substring) + if start_index != -1: # if start_substring is found + start_index += len( + start_substring + ) # move the index to the end of start_substring + end_index = md_content.find(end_substring, start_index) + if end_index != -1: # if end_substring is found + # Replace the old string with the new string + new_md_content = ( + md_content[:start_index] + new_string + md_content[end_index:] + ) + + return new_md_content + + +async def understand_image_with_vlm(image_base64): + """ + Sends a base64-encoded image to a VLM (Vision Language Model) endpoint for financial analysis. + + Args: + image_base64 (str): The base64-encoded string representation of the image. + + Returns: + str: The response from the VLM, which is either a financial analysis or a statement indicating the image is not useful. + """ + # prompt = "Describe the image ONLY IF it is useful for financial analysis. 
Otherwise, say 'NOT USEFUL IMAGE' and NOTHING ELSE. " + prompt = "Perform financial analysis of the image ONLY IF the image is of graph, chart, flowchart or table. Otherwise, say 'NOT USEFUL IMAGE' and NOTHING ELSE. " + headers = {"Content-Type": "application/json"} + data = {"prompt": prompt, "image": image_base64} + vlm_endpoint = os.environ["AIServices__VLM__Endpoint"] + async with aiohttp.ClientSession() as session: + async with session.post( + vlm_endpoint, headers=headers, json=data, timeout=30 + ) as response: + response_data = await response.json() + response_text = response_data["response"].split("")[0] + + if ( + "not useful for financial analysis" in response_text + or "NOT USEFUL IMAGE" in response_text + ): + return "Irrelevant Image" + else: + return response_text + + +def pil_image_to_base64(image, image_format="JPEG"): + """ + Converts a PIL image to a base64-encoded string. + + Args: + image (PIL.Image.Image): The image to be converted. + image_format (str): The format to save the image in. Defaults to "JPEG". + + Returns: + str: The base64-encoded string representation of the image. + """ + if image.mode == "RGBA" and image_format == "JPEG": + image = image.convert("RGB") + buffered = io.BytesIO() + image.save(buffered, format=image_format) + return base64.b64encode(buffered.getvalue()).decode("utf-8") + + +async def process_figures_from_extracted_content( + file_path: str, markdown_content: str, figures: list, page_number: None | int = None +) -> str: + """Process the figures extracted from the content using ADI and send them for analysis. + + Args: + ----- + file_path (str): The path to the PDF file. + markdown_content (str): The extracted content in Markdown format. + figures (list): The list of figures extracted by the Azure Document Intelligence service. + page_number (int): The page number to process. If None, all pages are processed. 
+ + Returns: + -------- + str: The updated Markdown content with the figure descriptions.""" + for idx, figure in enumerate(figures): + img_description = "" + logging.debug(f"Figure #{idx} has the following spans: {figure.spans}") + + caption_region = figure.caption.bounding_regions if figure.caption else [] + for region in figure.bounding_regions: + # Skip the region if it is not on the specified page + if page_number is not None and region.page_number != page_number: + continue + + if region not in caption_region: + # To learn more about bounding regions, see https://aka.ms/bounding-region + bounding_box = ( + region.polygon[0], # x0 (left) + region.polygon[1], # y0 (top) + region.polygon[4], # x1 (right) + region.polygon[5], # y1 (bottom) + ) + cropped_image = crop_image_from_pdf_page( + file_path, region.page_number - 1, bounding_box + ) # page_number is 1-indexed3 + + image_base64 = pil_image_to_base64(cropped_image) + + img_description += await understand_image_with_vlm(image_base64) + logging.info(f"\tDescription of figure {idx}: {img_description}") + + markdown_content = update_figure_description( + markdown_content, img_description, idx + ) + + return markdown_content + + +def create_page_wise_content(result: AnalyzeResult) -> list: + """Create a list of page-wise content extracted by the Azure Document Intelligence service. + + Args: + ----- + result (AnalyzeResult): The result of the document analysis. + + Returns: + -------- + list: A list of page-wise content extracted by the Azure Document Intelligence service. 
+ """ + + page_wise_content = [] + page_numbers = [] + page_number = 0 + for page in result.pages: + page_content = result.content[ + page.spans[0]["offset"] : page.spans[0]["offset"] + page.spans[0]["length"] + ] + page_wise_content.append(page_content) + page_number+=1 + page_numbers.append(page_number) + + return page_wise_content,page_numbers + + +async def analyse_document(file_path: str) -> AnalyzeResult: + """Analyse a document using the Azure Document Intelligence service. + + Args: + ----- + file_path (str): The path to the document to analyse. + + Returns: + -------- + AnalyzeResult: The result of the document analysis.""" + with open(file_path, "rb") as f: + file_read = f.read() + # base64_encoded_file = base64.b64encode(file_read).decode("utf-8") + + async with DocumentIntelligenceClient( + endpoint=os.environ["AIService__Services__Endpoint"], + credential=AzureKeyCredential(os.environ["AIService__Services__Key"]), + ) as document_intelligence_client: + poller = await document_intelligence_client.begin_analyze_document( + model_id="prebuilt-layout", + analyze_request=file_read, + output_content_format=ContentFormat.MARKDOWN, + content_type="application/octet-stream", + ) + + result = await poller.result() + + if result is None or result.content is None or result.pages is None: + raise ValueError( + "Failed to analyze the document with Azure Document Intelligence." + ) + + return result + + +async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> dict: + logging.info("Python HTTP trigger function processed a request.") + + storage_account_helper = StorageAccountHelper() + + try: + source = record["data"]["source"] + logging.info(f"Request Body: {record}") + except KeyError: + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": "Failed to extract data with ADI. 
Pass a valid source in the request body.", + } + ], + "warnings": None, + } + else: + logging.info(f"Source: {source}") + + try: + source_parts = source.split("/") + blob = "/".join(source_parts[4:]) + logging.info(f"Blob: {blob}") + + container = source_parts[3] + + file_extension = blob.split(".")[-1] + target_file_name = f"{record['recordId']}.{file_extension}" + + temp_file_path, _ = await storage_account_helper.download_blob_to_temp_dir( + blob, container, target_file_name + ) + logging.info(temp_file_path) + except Exception as e: + logging.error(f"Failed to download the blob: {e}") + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": f"Failed to download the blob. Check the source and try again. {e}", + } + ], + "warnings": None, + } + + try: + result = await analyse_document(temp_file_path) + except Exception as e: + logging.error(e) + logging.info("Sleeping for 10 seconds and retrying") + await asyncio.sleep(10) + try: + result = await analyse_document(temp_file_path) + except ValueError as inner_e: + logging.error(inner_e) + logging.error( + f"Failed to analyze the document with Azure Document Intelligence: {e}" + ) + logging.error( + "Failed to analyse %s with Azure Document Intelligence.", blob + ) + await storage_account_helper.add_metadata_to_blob( + blob, container, {"AzureSearch_Skip": "true"} + ) + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": f"Failed to analyze the document with Azure Document Intelligence. This blob will now be skipped {inner_e}", + } + ], + "warnings": None, + } + except Exception as inner_e: + logging.error(inner_e) + logging.error( + "Failed to analyse %s with Azure Document Intelligence.", blob + ) + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": f"Failed to analyze the document with Azure Document Intelligence. Check the logs and try again. 
{inner_e}", + } + ], + "warnings": None, + } + + try: + if chunk_by_page: + markdown_content,page_no = create_page_wise_content(result) + else: + markdown_content = result.content + + # Remove this line when VLM is ready + content_with_figures = markdown_content + + # if chunk_by_page: + # tasks = [ + # process_figures_from_extracted_content( + # temp_file_path, page_content, result.figures, page_number=idx + # ) + # for idx, page_content in enumerate(markdown_content) + # ] + # content_with_figures = await asyncio.gather(*tasks) + # else: + # content_with_figures = await process_figures_from_extracted_content( + # temp_file_path, markdown_content, result.figures + # ) + + # Remove remove_irrelevant_figures=True when VLM is ready + if chunk_by_page: + cleaned_result = [] + with concurrent.futures.ProcessPoolExecutor() as executor: + results = executor.map(clean_adi_markdown,content_with_figures, page_no,[False] * len(content_with_figures)) + + for cleaned_content in results: + cleaned_result.append(cleaned_content) + + # with concurrent.futures.ProcessPoolExecutor() as executor: + # futures = { + # executor.submit( + # clean_adi_markdown, page_content, False + # ): page_content + # for page_content in content_with_figures + # } + # for future in concurrent.futures.as_completed(futures): + # cleaned_result.append(future.result()) + else: + cleaned_result = clean_adi_markdown( + content_with_figures, page_no=-1,remove_irrelevant_figures=False + ) + except Exception as e: + logging.error(e) + logging.error(f"Failed to process the extracted content: {e}") + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": f"Failed to process the extracted content. Check the logs and try again. 
{e}", + } + ], + "warnings": None, + } + + logging.info("Document Extracted") + logging.info(f"Result: {cleaned_result}") + + src = { + "recordId": record["recordId"], + "data": {"extracted_content": cleaned_result}, + } + + json_str = json.dumps(src, indent=4) + + logging.info(f"final output: {json_str}") + + return { + "recordId": record["recordId"], + "data": {"extracted_content": cleaned_result}, + } diff --git a/function_apps/indexer/function_app.py b/function_apps/indexer/function_app.py new file mode 100644 index 0000000..12d5d5b --- /dev/null +++ b/function_apps/indexer/function_app.py @@ -0,0 +1,296 @@ +from datetime import datetime, timedelta, timezone +import azure.functions as func +import logging +import json +import asyncio + +from adi_2_ai_search import process_adi_2_ai_search +from common.service_bus import ServiceBusHelper +from pre_embedding_cleaner import process_pre_embedding_cleaner + +from text_split import process_text_split +from ai_search_2_compass import process_ai_search_2_compass +from key_phrase_extraction import process_key_phrase_extraction +from ocr import process_ocr +from pending_index_completion import process_pending_index_completion +from pending_index_trigger import process_pending_index_trigger + +from common.payloads.pending_index_trigger import PendingIndexTriggerPayload + +from common.payloads.header import TaskEnum + +logging.basicConfig(level=logging.INFO) +app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION) + + +@app.route(route="text_split", methods=[func.HttpMethod.POST]) +async def text_split(req: func.HttpRequest) -> func.HttpResponse: + """Extract the content from a document using ADI.""" + + try: + req_body = req.get_json() + values = req_body.get("values") + text_split_config = req.headers + except ValueError: + return func.HttpResponse( + "Please valid Custom Skill Payload in the request body", status_code=400 + ) + else: + logging.debug(f"Input Values: {values}") + + record_tasks = [] + + for value 
in values: + record_tasks.append( + asyncio.create_task(process_text_split(value, text_split_config)) + ) + + results = await asyncio.gather(*record_tasks) + logging.debug(f"Results: {results}") + + return func.HttpResponse( + json.dumps({"values": results}), + status_code=200, + mimetype="application/json", + ) + + +@app.route(route="ai_search_2_compass", methods=[func.HttpMethod.POST]) +async def ai_search_2_compass(req: func.HttpRequest) -> func.HttpResponse: + logging.info("Python HTTP trigger function processed a request.") + + """HTTP trigger for AI Search 2 Compass function. + + Args: + req (func.HttpRequest): The HTTP request object. + + Returns: + func.HttpResponse: The HTTP response object.""" + logging.info("Python HTTP trigger function processed a request.") + + try: + req_body = req.get_json() + values = req_body.get("values") + except ValueError: + return func.HttpResponse( + "Please valid Custom Skill Payload in the request body", status_code=400 + ) + else: + logging.debug("Input Values: %s", values) + + record_tasks = [] + + for value in values: + record_tasks.append(asyncio.create_task(process_ai_search_2_compass(value))) + + results = await asyncio.gather(*record_tasks) + logging.debug("Results: %s", results) + vectorised_tasks = {"values": results} + + return func.HttpResponse( + json.dumps(vectorised_tasks), status_code=200, mimetype="application/json" + ) + + +@app.route(route="adi_2_ai_search", methods=[func.HttpMethod.POST]) +async def adi_2_ai_search(req: func.HttpRequest) -> func.HttpResponse: + """Extract the content from a document using ADI.""" + + try: + req_body = req.get_json() + values = req_body.get("values") + adi_config = req.headers + + chunk_by_page = adi_config.get("chunk_by_page", "False").lower() == "true" + logging.info(f"Chunk by Page: {chunk_by_page}") + except ValueError: + return func.HttpResponse( + "Please valid Custom Skill Payload in the request body", status_code=400 + ) + else: + logging.debug("Input Values: %s", 
values) + + record_tasks = [] + + for value in values: + record_tasks.append( + asyncio.create_task( + process_adi_2_ai_search(value, chunk_by_page=chunk_by_page) + ) + ) + + results = await asyncio.gather(*record_tasks) + logging.debug("Results: %s", results) + + return func.HttpResponse( + json.dumps({"values": results}), + status_code=200, + mimetype="application/json", + ) + + +@app.route(route="pre_embedding_cleaner", methods=[func.HttpMethod.POST]) +async def pre_embedding_cleaner(req: func.HttpRequest) -> func.HttpResponse: + """HTTP trigger for data cleanup function. + + Args: + req (func.HttpRequest): The HTTP request object. + + Returns: + func.HttpResponse: The HTTP response object.""" + logging.info("Python HTTP trigger data cleanup function processed a request.") + + try: + req_body = req.get_json() + values = req_body.get("values") + except ValueError: + return func.HttpResponse( + "Please valid Custom Skill Payload in the request body", status_code=400 + ) + else: + logging.debug("Input Values: %s", values) + + record_tasks = [] + + for value in values: + record_tasks.append( + asyncio.create_task(process_pre_embedding_cleaner(value)) + ) + + results = await asyncio.gather(*record_tasks) + logging.debug("Results: %s", results) + cleaned_tasks = {"values": results} + + return func.HttpResponse( + json.dumps(cleaned_tasks), status_code=200, mimetype="application/json" + ) + + +@app.route(route="keyphrase_extractor", methods=[func.HttpMethod.POST]) +async def keyphrase_extractor(req: func.HttpRequest) -> func.HttpResponse: + """HTTP trigger for data cleanup function. + + Args: + req (func.HttpRequest): The HTTP request object. 
+ + Returns: + func.HttpResponse: The HTTP response object.""" + logging.info("Python HTTP trigger data cleanup function processed a request.") + + try: + req_body = req.get_json() + values = req_body.get("values") + logging.info(req_body) + except ValueError: + return func.HttpResponse( + "Please valid Custom Skill Payload in the request body", status_code=400 + ) + else: + logging.debug("Input Values: %s", values) + + record_tasks = [] + + for value in values: + record_tasks.append( + asyncio.create_task(process_key_phrase_extraction(value)) + ) + + results = await asyncio.gather(*record_tasks) + logging.debug("Results: %s", results) + cleaned_tasks = {"values": results} + + return func.HttpResponse( + json.dumps(cleaned_tasks), status_code=200, mimetype="application/json" + ) + + +@app.route(route="ocr", methods=[func.HttpMethod.POST]) +async def ocr(req: func.HttpRequest) -> func.HttpResponse: + """HTTP trigger for data cleanup function. + + Args: + req (func.HttpRequest): The HTTP request object. 
+ + Returns: + func.HttpResponse: The HTTP response object.""" + logging.info("Python HTTP trigger data cleanup function processed a request.") + + try: + req_body = req.get_json() + values = req_body.get("values") + except ValueError: + return func.HttpResponse( + "Please valid Custom Skill Payload in the request body", status_code=400 + ) + else: + logging.debug("Input Values: %s", values) + + record_tasks = [] + + for value in values: + record_tasks.append(asyncio.create_task(process_ocr(value))) + + results = await asyncio.gather(*record_tasks) + logging.debug("Results: %s", results) + cleaned_tasks = {"values": results} + + return func.HttpResponse( + json.dumps(cleaned_tasks), status_code=200, mimetype="application/json" + ) + + +@app.service_bus_queue_trigger( + arg_name="msg", + queue_name="pending_index_trigger", + connection="ServiceBusTrigger", +) +async def pending_index_trigger(msg: func.ServiceBusMessage): + logging.info( + f"trigger-indexer: Python ServiceBus queue trigger processed message: {msg}" + ) + try: + payload = PendingIndexTriggerPayload.from_service_bus_message(msg) + await process_pending_index_trigger(payload) + except ValueError as ve: + logging.error(f"ValueError: {ve}") + except Exception as e: + logging.error(f"Error processing ServiceBus message: {e}") + + if "On-demand indexer invocation is permitted every 180 seconds" in str(e): + logging.warning( + f"Indexer invocation limit reached: {e}. Scheduling a retry." 
+ ) + service_bus_helper = ServiceBusHelper() + message = PendingIndexTriggerPayload( + header=payload.header, body=payload.body, errors=[] + ) + queue = TaskEnum.PENDING_INDEX_TRIGGER.value + minutes = 2 ** (11 - payload.header.retries_remaining) + enqueue_time = datetime.now(timezone.utc) + timedelta(minutes=minutes) + await service_bus_helper.send_message_to_service_bus_queue( + queue, message, enqueue_time=enqueue_time + ) + else: + raise e + + +@app.service_bus_queue_trigger( + arg_name="msg", + queue_name="pending_index_completion", + connection="ServiceBusTrigger", +) +async def pending_index_completion(msg: func.ServiceBusMessage): + logging.info( + f"indexer-polling-trigger: Python ServiceBus queue trigger processed message: {msg}" + ) + + try: + payload = PendingIndexTriggerPayload.from_service_bus_message(msg) + await process_pending_index_completion(payload) + except ValueError as ve: + logging.error(f"ValueError: {ve}") + except Exception as e: + logging.error(f"Error processing ServiceBus message: {e}") + if "The operation has timed out" in str(e): + logging.error("The operation has timed out.") + raise e diff --git a/function_apps/indexer/key_phrase_extraction.py b/function_apps/indexer/key_phrase_extraction.py new file mode 100644 index 0000000..c6ab40e --- /dev/null +++ b/function_apps/indexer/key_phrase_extraction.py @@ -0,0 +1,112 @@ +import logging +import json +import os +from azure.ai.textanalytics.aio import TextAnalyticsClient +from azure.core.exceptions import HttpResponseError +from azure.core.credentials import AzureKeyCredential +import asyncio + +MAX_TEXT_ELEMENTS = 5120 + +def split_document(document, max_size): + """Split a document into chunks of max_size.""" + return [document[i:i + max_size] for i in range(0, len(document), max_size)] + +async def extract_key_phrases_from_text(data: list[str],max_key_phrase_count:int) -> list[str]: + logging.info("Python HTTP trigger function processed a request.") + + max_retries = 5 + 
key_phrase_list = [] + text_analytics_client = TextAnalyticsClient( + endpoint=os.environ["AIService__Services__Endpoint"], + credential=AzureKeyCredential(os.environ["AIService__Services__Key"]), + ) + + try: + async with text_analytics_client: + retries = 0 + while retries < max_retries: + try: + # Split large documents + split_documents = [] + for doc in data: + if len(doc) > MAX_TEXT_ELEMENTS: + split_documents.extend(split_document(doc, MAX_TEXT_ELEMENTS)) + else: + split_documents.append(doc) + result = await text_analytics_client.extract_key_phrases(split_documents) + for idx,doc in enumerate(result): + if not doc.is_error: + key_phrase_list.extend(doc.key_phrases[:max_key_phrase_count]) + else: + raise Exception(f"Document {idx} error: {doc.error}") + break # Exit the loop if the request is successful + except HttpResponseError as e: + if e.status_code == 429: # Rate limiting error + retries += 1 + wait_time = 2 ** retries # Exponential backoff + print(f"Rate limit exceeded. Retrying in {wait_time} seconds...") + await asyncio.sleep(wait_time) + else: + raise Exception(f"An error occurred: {e}") + except Exception as e: + raise Exception(f"An error occurred: {e}") + + return key_phrase_list + + +async def process_key_phrase_extraction(record: dict,max_key_phrase_count:int =5 ) -> dict: + """Extract key phrases using azure ai services. + + Args: + record (dict): The record to process. 
+ max_key_phrase_count(int): no of keywords to return + + Returns: + dict: extracted key words.""" + + try: + json_str = json.dumps(record, indent=4) + + logging.info(f"key phrase extraction Input: {json_str}") + extracted_record = { + "recordId": record["recordId"], + "data": {}, + "errors": None, + "warnings": None, + } + extracted_record["data"]["keyPhrases"] = await extract_key_phrases_from_text( + [record["data"]["text"]],max_key_phrase_count + ) + except Exception as e: + logging.error("key phrase extraction Error: %s", e) + await asyncio.sleep(10) + try: + extracted_record = { + "recordId": record["recordId"], + "data": {}, + "errors": None, + "warnings": None, + } + extracted_record["data"][ + "keyPhrases" + ] = await extract_key_phrases_from_text([record["data"]["text"]],max_key_phrase_count) + except Exception as inner_e: + logging.error("key phrase extraction Error: %s", inner_e) + logging.error( + "Failed to extract key phrase. Check function app logs for more details of exact failure." + ) + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": "Failed to extract key phrase. Check function app logs for more details of exact failure." 
+ } + ], + "warnings": None, + } + json_str = json.dumps(extracted_record, indent=4) + + logging.info(f"key phrase extraction output: {json_str}") + return extracted_record diff --git a/function_apps/indexer/pre_embedding_cleaner.py b/function_apps/indexer/pre_embedding_cleaner.py new file mode 100644 index 0000000..2fdf87a --- /dev/null +++ b/function_apps/indexer/pre_embedding_cleaner.py @@ -0,0 +1,144 @@ +import logging +import json +import string +import nltk +import re +from nltk.tokenize import word_tokenize + +nltk.download("punkt") +nltk.download("stopwords") + +import re + +# Configure logging +logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s') + +def get_section(cleaned_text:str) -> list: + """ + Returns the section details from the content + + Args: + cleaned_text: The input text + + Returns: + list: The sections related to text + + """ + combined_pattern = r'(.*?)\n===|\n## ?(.*?)\n|\n### ?(.*?)\n' + doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL) + doc_metadata = [match for group in doc_metadata for match in group if match] + return doc_metadata + +def remove_markdown_tags(text:str, tag_patterns:dict) ->str: + """ + Remove specified Markdown tags from the text, keeping the contents of the tags. + + Args: + text: The input text containing Markdown tags. + tag_patterns: A dictionary where keys are tags and values are their specific patterns. + + Returns: + str: The text with specified tags removed. 
+ """ + try: + for tag, pattern in tag_patterns.items(): + try: + # Replace the tags using the specific pattern, keeping the content inside the tags + text = re.sub(pattern, r'\1', text, flags=re.DOTALL) + except re.error as e: + logging.error(f"Regex error for tag '{tag}': {e}") + except Exception as e: + logging.error(f"An error occurred in remove_markdown_tags: {e}") + return text + +def clean_text(src_text: str) -> str: + """This function performs following cleanup activities on the text, remove all unicode characters + remove line spacing,remove stop words, normalize characters + + Args: + src_text (str): The text to cleanup. + + Returns: + str: The clean text.""" + + try: + # Define specific patterns for each tag + tag_patterns = { + "figurecontent": r"", + "figure": r"
<figure>(.*?)</figure>
", + "figures": r"\(figures/\d+\)(.*?)\(figures/\d+\)", + "figcaption": r"
<figcaption>(.*?)</figcaption>
", + } + cleaned_text = remove_markdown_tags(src_text, tag_patterns) + + # remove line breaks + cleaned_text = re.sub(r"\n", "", cleaned_text) + + # remove stopwords + tokens = word_tokenize(cleaned_text, "english") + stop_words = nltk.corpus.stopwords.words("english") + filtered_tokens = [word for word in tokens if word not in stop_words] + cleaned_text = " ".join(filtered_tokens) + + # remove special characters + cleaned_text = re.sub(r"[^a-zA-Z\s]", "", cleaned_text) + + # remove extra white spaces + cleaned_text = " ".join([word for word in cleaned_text.split()]) + + # case normalization + cleaned_text = cleaned_text.lower() + except Exception as e: + logging.error(f"An error occurred in clean_text: {e}") + return "" + return cleaned_text + + +async def process_pre_embedding_cleaner(record: dict) -> dict: + """Cleanup the data using standard python libraries. + + Args: + record (dict): The record to cleanup. + + Returns: + dict: The clean record.""" + + try: + json_str = json.dumps(record, indent=4) + + logging.info(f"embedding cleaner Input: {json_str}") + + cleaned_record = { + "recordId": record["recordId"], + "data": {}, + "errors": None, + "warnings": None, + } + + # scenarios when page by chunking is enabled + if isinstance(record["data"]["chunk"],dict): + cleaned_record["data"]["cleaned_chunk"] = clean_text(record["data"]["chunk"]["content"]) + cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"] + cleaned_record["data"]["section"] = record["data"]["chunk"]["section"] + cleaned_record["data"]["page_number"] = record["data"]["chunk"]["page_number"] + else: + cleaned_record["data"]["cleaned_chunk"] = clean_text(record["data"]["chunk"]) + cleaned_record["data"]["chunk"] = record["data"]["chunk"] + cleaned_record["data"]["section"] = get_section(record["data"]["chunk"]) + + except Exception as e: + logging.error("string cleanup Error: %s", e) + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": "Failed to 
cleanup data. Check function app logs for more details of exact failure." + } + ], + "warnings": None, + } + json_str = json.dumps(cleaned_record, indent=4) + + logging.info(f"embedding cleaner output: {json_str}") + return cleaned_record diff --git a/function_apps/indexer/requirements.txt b/function_apps/indexer/requirements.txt new file mode 100644 index 0000000..48c9837 --- /dev/null +++ b/function_apps/indexer/requirements.txt @@ -0,0 +1,26 @@ +# DO NOT include azure-functions-worker in this file +# The Python Worker is managed by Azure Functions platform +# Manually managing azure-functions-worker may cause unexpected issues +python-dotenv +azure-functions +openai +azure-storage-blob +pandas +azure-identity +openpyxl +regex +nltk==3.8.1 +bs4 +azure-search +azure-search-documents +azure-ai-documentintelligence +azure-ai-textanalytics +azure-ai-vision-imageanalysis +PyMuPDF +pillow +torch +aiohttp +spacy==3.7.5 +transformers +scikit-learn +en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1.tar.gz From 8d177a003b33b6b5433a91797149822fb0bd121f Mon Sep 17 00:00:00 2001 From: priyal1508 <54278892+priyal1508@users.noreply.github.com> Date: Thu, 5 Sep 2024 19:11:57 +0530 Subject: [PATCH 02/33] changes for common scripts --- aisearch-skillset/deploy.py | 13 ------ aisearch-skillset/inquiry_document.py | 16 ++++---- function_apps/common/payloads/error.py | 20 ++++++++++ function_apps/common/payloads/header.py | 40 +++++++++++++++++++ function_apps/common/payloads/payload.py | 20 ++++++++++ .../payloads/pending_index_completion.py | 40 +++++++++++++++++++ .../common/payloads/pennding_index_trigger.py | 32 +++++++++++++++ .../indexer/pending_index_completion.py | 0 8 files changed, 160 insertions(+), 21 deletions(-) create mode 100644 function_apps/common/payloads/error.py create mode 100644 function_apps/common/payloads/header.py create mode 100644 function_apps/common/payloads/payload.py create mode 
100644 function_apps/common/payloads/pending_index_completion.py create mode 100644 function_apps/common/payloads/pennding_index_trigger.py create mode 100644 function_apps/indexer/pending_index_completion.py diff --git a/aisearch-skillset/deploy.py b/aisearch-skillset/deploy.py index d98e099..1b2190b 100644 --- a/aisearch-skillset/deploy.py +++ b/aisearch-skillset/deploy.py @@ -30,19 +30,6 @@ def main(args): rebuild=args.rebuild, enable_page_by_chunking=args.enable_page_chunking ) - elif args.indexer_type == "summary": - # Deploy the summarises index - index_config = SummaryDocumentAISearch( - endpoint=endpoint, - credential=credential, - suffix=args.suffix, - rebuild=args.rebuild, - enable_page_by_chunking=args.enable_page_chunking - ) - elif args.indexer_type == "glossary": - # Deploy business glossary index - index_config = BusinessGlossaryAISearch(endpoint, credential) - index_config.deploy() if args.rebuild: diff --git a/aisearch-skillset/inquiry_document.py b/aisearch-skillset/inquiry_document.py index 3f9dd0a..b70251e 100644 --- a/aisearch-skillset/inquiry_document.py +++ b/aisearch-skillset/inquiry_document.py @@ -63,14 +63,14 @@ def get_index_fields(self) -> list[SearchableField]: name="Title", type=SearchFieldDataType.String, filterable=True ), SearchableField( - name="DealId", + name="ID1", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True, ), SearchableField( - name="OracleId", + name="ID2", type=SearchFieldDataType.String, sortable=True, filterable=True, @@ -212,12 +212,12 @@ def get_index_projections(self) -> SearchIndexerIndexProjections: source="/document/Title" ), InputFieldMappingEntry( - name="DealId", - source="/document/DealId" + name="ID1", + source="/document/ID1" ), InputFieldMappingEntry( - name="OracleId", - source="/document/OracleId" + name="ID2", + source="/document/ID2" ), InputFieldMappingEntry( name="SourceUrl", @@ -302,9 +302,9 @@ def get_indexer(self) -> SearchIndexer: FieldMapping( 
source_field_name="metadata_storage_name", target_field_name="Title" ), - FieldMapping(source_field_name="Deal_ID", target_field_name="DealId"), + FieldMapping(source_field_name="ID1", target_field_name="ID1"), FieldMapping( - source_field_name="Oracle_ID", target_field_name="OracleId" + source_field_name="ID2", target_field_name="ID2" ), FieldMapping( source_field_name="SharePointUrl", target_field_name="SourceUrl" diff --git a/function_apps/common/payloads/error.py b/function_apps/common/payloads/error.py new file mode 100644 index 0000000..49e456e --- /dev/null +++ b/function_apps/common/payloads/error.py @@ -0,0 +1,20 @@ +from typing import Optional +from pydantic import BaseModel, Field, ConfigDict +from datetime import datetime, timezone + + +class Error(BaseModel): + """Error item model""" + + code: str = Field(..., description="The error code") + message: str = Field(..., description="The error message") + details: Optional[str] = Field( + None, description="Detailed error information from Python" + ) + timestamp: Optional[datetime] = Field( + ..., + description="Creation timestamp in UTC", + default_factory=lambda: datetime.now(timezone.utc), + ) + + __config__ = ConfigDict(extra="ignore") diff --git a/function_apps/common/payloads/header.py b/function_apps/common/payloads/header.py new file mode 100644 index 0000000..e7a521c --- /dev/null +++ b/function_apps/common/payloads/header.py @@ -0,0 +1,40 @@ +from pydantic import BaseModel, Field, ConfigDict +from datetime import datetime, timezone +from enum import Enum + + +class DataTypeEnum(Enum): + """Type enum""" + + BUSINESS_GLOSSARY = "business_glossary" + SUMMARY = "summary" + + +class TaskEnum(Enum): + """Task enum""" + + PENDING_INDEX_COMPLETION = "pending_index_completion" + PENDING_INDEX_TRIGGER = "pending_index_trigger" + PENDING_SUMMARY_GENERATION = "pending_summary_generation" + + +class Header(BaseModel): + """Header model""" + + creation_timestamp: datetime = Field( + ..., + 
description="Creation timestamp in UTC", + default_factory=lambda: datetime.now(timezone.utc), + ) + last_processed_timestamp: datetime = Field( + ..., + description="Last processed timestamp in UTC", + default_factory=lambda: datetime.now(timezone.utc), + ) + retries_remaining: int = Field( + description="Number of retries remaining", default=10 + ) + data_type: DataTypeEnum = Field(..., description="Data type") + task: TaskEnum = Field(..., description="Task name") + + __config__ = ConfigDict(extra="ignore") diff --git a/function_apps/common/payloads/payload.py b/function_apps/common/payloads/payload.py new file mode 100644 index 0000000..fb2f4f9 --- /dev/null +++ b/function_apps/common/payloads/payload.py @@ -0,0 +1,20 @@ +from pydantic import BaseModel, ConfigDict +import logging + + +class Payload(BaseModel): + """Body model""" + + @classmethod + def from_service_bus_message(cls, message): + """ + Create a Payload object from a ServiceBusMessage object. + + :param message: The ServiceBusMessage object. + :return: The Body object. 
+ """ + message = message.get_body().decode("utf-8") + logging.info(f"ServiceBus message: {message}") + return cls.model_validate_json(message) + + __config__ = ConfigDict(extra="ignore") diff --git a/function_apps/common/payloads/pending_index_completion.py b/function_apps/common/payloads/pending_index_completion.py new file mode 100644 index 0000000..8aa0335 --- /dev/null +++ b/function_apps/common/payloads/pending_index_completion.py @@ -0,0 +1,40 @@ +from pydantic import BaseModel, Field, ConfigDict +from datetime import datetime, timezone +from typing import Optional, List + +from common.payloads.header import Header +from common.payloads.error import Error +from common.payloads.payload import Payload + + +class PendingIndexCompletionBody(BaseModel): + """Body model""" + + indexer: str = Field(..., description="The indexer to trigger") + deal_id: Optional[int] = Field(None, description="The deal ID") + blob_storage_url: Optional[str] = Field( + ..., description="The URL to the blob storage" + ) + deal_name: Optional[str] = Field( + None, description="The text name for the integer deal ID" + ) + business_unit: Optional[str] = Field(None, description="The business unit") + indexer_start_time: Optional[datetime] = Field( + ..., + description="The time the indexer was triggered successfully", + default_factory=lambda: datetime.now(timezone.utc), + ) + + __config__ = ConfigDict(extra="ignore") + + +class PendingIndexCompletionPayload(Payload): + """Pending Index Trigger model""" + + header: Header = Field(..., description="Header information") + body: PendingIndexCompletionBody = Field(..., description="Body information") + errors: List[Error] | None = Field( + ..., description="List of errors", default_factory=list + ) + + __config__ = ConfigDict(extra="ignore") diff --git a/function_apps/common/payloads/pennding_index_trigger.py b/function_apps/common/payloads/pennding_index_trigger.py new file mode 100644 index 0000000..2a519d9 --- /dev/null +++ 
b/function_apps/common/payloads/pennding_index_trigger.py @@ -0,0 +1,32 @@ +from pydantic import BaseModel, Field, ConfigDict +from typing import Optional, List + +from common.payloads.header import Header +from common.payloads.error import Error +from common.payloads.payload import Payload + + +class PendingIndexTriggerBody(BaseModel): + """Body model""" + + indexer: str = Field(..., description="The indexer to trigger") + deal_id: Optional[int] = Field(None, description="The deal ID") + blob_storage_url: str = Field(..., description="The URL to the blob storage") + deal_name: Optional[str] = Field( + None, description="The text name for the integer deal ID" + ) + business_unit: Optional[str] = Field(None, description="The business unit") + + __config__ = ConfigDict(extra="ignore") + + +class PendingIndexTriggerPayload(Payload): + """Pending Index Trigger model""" + + header: Header = Field(..., description="Header information") + body: PendingIndexTriggerBody = Field(..., description="Body information") + errors: List[Error] | None = Field( + ..., description="List of errors", default_factory=list + ) + + __config__ = ConfigDict(extra="ignore") diff --git a/function_apps/indexer/pending_index_completion.py b/function_apps/indexer/pending_index_completion.py new file mode 100644 index 0000000..e69de29 From 461702883d1f9c15725d87a396bcb7862a318092 Mon Sep 17 00:00:00 2001 From: priyal1508 <54278892+priyal1508@users.noreply.github.com> Date: Thu, 5 Sep 2024 19:14:10 +0530 Subject: [PATCH 03/33] fixing bugs --- .../{pennding_index_trigger.py => pending_index_trigger.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename function_apps/common/payloads/{pennding_index_trigger.py => pending_index_trigger.py} (100%) diff --git a/function_apps/common/payloads/pennding_index_trigger.py b/function_apps/common/payloads/pending_index_trigger.py similarity index 100% rename from function_apps/common/payloads/pennding_index_trigger.py rename to 
function_apps/common/payloads/pending_index_trigger.py From de48566b1c9c01478ad533e7cc48cca27e420d03 Mon Sep 17 00:00:00 2001 From: priyal1508 <54278892+priyal1508@users.noreply.github.com> Date: Thu, 5 Sep 2024 19:21:08 +0530 Subject: [PATCH 04/33] changes in fodler structure --- {aisearch-skillset => ai_search_with_adi}/ai_search.py | 0 {aisearch-skillset => ai_search_with_adi}/deploy.py | 0 {aisearch-skillset => ai_search_with_adi}/environment.py | 0 .../function_apps}/common/ai_search.py | 0 .../function_apps}/common/payloads/error.py | 0 .../function_apps}/common/payloads/header.py | 0 .../function_apps}/common/payloads/payload.py | 0 .../function_apps}/common/payloads/pending_index_completion.py | 0 .../function_apps}/common/payloads/pending_index_trigger.py | 0 .../function_apps}/indexer/adi_2_aisearch.py | 0 .../function_apps}/indexer/function_app.py | 0 .../function_apps}/indexer/key_phrase_extraction.py | 0 .../function_apps}/indexer/pending_index_completion.py | 0 .../function_apps}/indexer/pre_embedding_cleaner.py | 0 .../function_apps}/indexer/requirements.txt | 0 {aisearch-skillset => ai_search_with_adi}/inquiry_document.py | 0 16 files changed, 0 insertions(+), 0 deletions(-) rename {aisearch-skillset => ai_search_with_adi}/ai_search.py (100%) rename {aisearch-skillset => ai_search_with_adi}/deploy.py (100%) rename {aisearch-skillset => ai_search_with_adi}/environment.py (100%) rename {function_apps => ai_search_with_adi/function_apps}/common/ai_search.py (100%) rename {function_apps => ai_search_with_adi/function_apps}/common/payloads/error.py (100%) rename {function_apps => ai_search_with_adi/function_apps}/common/payloads/header.py (100%) rename {function_apps => ai_search_with_adi/function_apps}/common/payloads/payload.py (100%) rename {function_apps => ai_search_with_adi/function_apps}/common/payloads/pending_index_completion.py (100%) rename {function_apps => ai_search_with_adi/function_apps}/common/payloads/pending_index_trigger.py (100%) 
rename {function_apps => ai_search_with_adi/function_apps}/indexer/adi_2_aisearch.py (100%) rename {function_apps => ai_search_with_adi/function_apps}/indexer/function_app.py (100%) rename {function_apps => ai_search_with_adi/function_apps}/indexer/key_phrase_extraction.py (100%) rename {function_apps => ai_search_with_adi/function_apps}/indexer/pending_index_completion.py (100%) rename {function_apps => ai_search_with_adi/function_apps}/indexer/pre_embedding_cleaner.py (100%) rename {function_apps => ai_search_with_adi/function_apps}/indexer/requirements.txt (100%) rename {aisearch-skillset => ai_search_with_adi}/inquiry_document.py (100%) diff --git a/aisearch-skillset/ai_search.py b/ai_search_with_adi/ai_search.py similarity index 100% rename from aisearch-skillset/ai_search.py rename to ai_search_with_adi/ai_search.py diff --git a/aisearch-skillset/deploy.py b/ai_search_with_adi/deploy.py similarity index 100% rename from aisearch-skillset/deploy.py rename to ai_search_with_adi/deploy.py diff --git a/aisearch-skillset/environment.py b/ai_search_with_adi/environment.py similarity index 100% rename from aisearch-skillset/environment.py rename to ai_search_with_adi/environment.py diff --git a/function_apps/common/ai_search.py b/ai_search_with_adi/function_apps/common/ai_search.py similarity index 100% rename from function_apps/common/ai_search.py rename to ai_search_with_adi/function_apps/common/ai_search.py diff --git a/function_apps/common/payloads/error.py b/ai_search_with_adi/function_apps/common/payloads/error.py similarity index 100% rename from function_apps/common/payloads/error.py rename to ai_search_with_adi/function_apps/common/payloads/error.py diff --git a/function_apps/common/payloads/header.py b/ai_search_with_adi/function_apps/common/payloads/header.py similarity index 100% rename from function_apps/common/payloads/header.py rename to ai_search_with_adi/function_apps/common/payloads/header.py diff --git a/function_apps/common/payloads/payload.py 
b/ai_search_with_adi/function_apps/common/payloads/payload.py similarity index 100% rename from function_apps/common/payloads/payload.py rename to ai_search_with_adi/function_apps/common/payloads/payload.py diff --git a/function_apps/common/payloads/pending_index_completion.py b/ai_search_with_adi/function_apps/common/payloads/pending_index_completion.py similarity index 100% rename from function_apps/common/payloads/pending_index_completion.py rename to ai_search_with_adi/function_apps/common/payloads/pending_index_completion.py diff --git a/function_apps/common/payloads/pending_index_trigger.py b/ai_search_with_adi/function_apps/common/payloads/pending_index_trigger.py similarity index 100% rename from function_apps/common/payloads/pending_index_trigger.py rename to ai_search_with_adi/function_apps/common/payloads/pending_index_trigger.py diff --git a/function_apps/indexer/adi_2_aisearch.py b/ai_search_with_adi/function_apps/indexer/adi_2_aisearch.py similarity index 100% rename from function_apps/indexer/adi_2_aisearch.py rename to ai_search_with_adi/function_apps/indexer/adi_2_aisearch.py diff --git a/function_apps/indexer/function_app.py b/ai_search_with_adi/function_apps/indexer/function_app.py similarity index 100% rename from function_apps/indexer/function_app.py rename to ai_search_with_adi/function_apps/indexer/function_app.py diff --git a/function_apps/indexer/key_phrase_extraction.py b/ai_search_with_adi/function_apps/indexer/key_phrase_extraction.py similarity index 100% rename from function_apps/indexer/key_phrase_extraction.py rename to ai_search_with_adi/function_apps/indexer/key_phrase_extraction.py diff --git a/function_apps/indexer/pending_index_completion.py b/ai_search_with_adi/function_apps/indexer/pending_index_completion.py similarity index 100% rename from function_apps/indexer/pending_index_completion.py rename to ai_search_with_adi/function_apps/indexer/pending_index_completion.py diff --git 
a/function_apps/indexer/pre_embedding_cleaner.py b/ai_search_with_adi/function_apps/indexer/pre_embedding_cleaner.py similarity index 100% rename from function_apps/indexer/pre_embedding_cleaner.py rename to ai_search_with_adi/function_apps/indexer/pre_embedding_cleaner.py diff --git a/function_apps/indexer/requirements.txt b/ai_search_with_adi/function_apps/indexer/requirements.txt similarity index 100% rename from function_apps/indexer/requirements.txt rename to ai_search_with_adi/function_apps/indexer/requirements.txt diff --git a/aisearch-skillset/inquiry_document.py b/ai_search_with_adi/inquiry_document.py similarity index 100% rename from aisearch-skillset/inquiry_document.py rename to ai_search_with_adi/inquiry_document.py From e9a0b8e112351cc5ad53587934e0b22843f09b56 Mon Sep 17 00:00:00 2001 From: priyal1508 <54278892+priyal1508@users.noreply.github.com> Date: Mon, 9 Sep 2024 19:14:21 +0530 Subject: [PATCH 05/33] adi and indexer changes --- .../{ => ai_search}/ai_search.py | 137 ++++--- ai_search_with_adi/{ => ai_search}/deploy.py | 34 +- .../{ => ai_search}/environment.py | 3 + .../{ => ai_search}/inquiry_document.py | 161 ++++---- .../function_apps/common/ai_search.py | 9 +- .../common/delay_processing_exception.py | 4 + .../function_apps/common/payloads/error.py | 3 + .../function_apps/common/payloads/header.py | 5 +- .../function_apps/common/payloads/payload.py | 3 + .../payloads/pending_index_completion.py | 9 +- .../common/payloads/pending_index_trigger.py | 13 +- .../function_apps/common/requirements.txt | 11 + .../function_apps/common/service_bus.py | 46 +++ .../function_apps/common/storage_account.py | 78 ++++ .../{adi_2_aisearch.py => adi_2_ai_search.py} | 151 +++++--- .../function_apps/indexer/function_app.py | 3 + .../indexer/key_phrase_extraction.py | 3 + .../function_apps/indexer/ocr.py | 86 +++++ .../indexer/pending_index_completion.py | 107 ++++++ .../indexer/pending_index_trigger.py | 94 +++++ .../indexer/pre_embedding_cleaner.py | 3 + 
.../function_apps/indexer/text_split.py | 355 ++++++++++++++++++ 22 files changed, 1087 insertions(+), 231 deletions(-) rename ai_search_with_adi/{ => ai_search}/ai_search.py (91%) rename ai_search_with_adi/{ => ai_search}/deploy.py (67%) rename ai_search_with_adi/{ => ai_search}/environment.py (98%) rename ai_search_with_adi/{ => ai_search}/inquiry_document.py (66%) create mode 100644 ai_search_with_adi/function_apps/common/delay_processing_exception.py create mode 100644 ai_search_with_adi/function_apps/common/requirements.txt create mode 100644 ai_search_with_adi/function_apps/common/service_bus.py create mode 100644 ai_search_with_adi/function_apps/common/storage_account.py rename ai_search_with_adi/function_apps/indexer/{adi_2_aisearch.py => adi_2_ai_search.py} (80%) create mode 100644 ai_search_with_adi/function_apps/indexer/ocr.py create mode 100644 ai_search_with_adi/function_apps/indexer/pending_index_trigger.py create mode 100644 ai_search_with_adi/function_apps/indexer/text_split.py diff --git a/ai_search_with_adi/ai_search.py b/ai_search_with_adi/ai_search/ai_search.py similarity index 91% rename from ai_search_with_adi/ai_search.py rename to ai_search_with_adi/ai_search/ai_search.py index 7573055..6ababd7 100644 --- a/ai_search_with_adi/ai_search.py +++ b/ai_search_with_adi/ai_search/ai_search.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ from abc import ABC, abstractmethod from azure.search.documents.indexes.models import ( SearchIndex, @@ -28,7 +31,7 @@ ) from azure.core.exceptions import HttpResponseError from azure.search.documents.indexes import SearchIndexerClient, SearchIndexClient -from environment import ( +from ai_search_with_adi.ai_search.environment import ( get_fq_blob_connection_string, get_blob_container_name, get_custom_skill_function_url, @@ -70,31 +73,48 @@ def __init__( @property def indexer_name(self): + """Get the indexer name for the indexer.""" return f"{str(self.indexer_type.value)}-indexer{self.suffix}" @property def skillset_name(self): + """Get the skillset name for the indexer.""" return f"{str(self.indexer_type.value)}-skillset{self.suffix}" @property def semantic_config_name(self): + """Get the semantic config name for the indexer.""" return f"{str(self.indexer_type.value)}-semantic-config{self.suffix}" @property def index_name(self): + """Get the index name for the indexer.""" return f"{str(self.indexer_type.value)}-index{self.suffix}" @property def data_source_name(self): + """Get the data source name for the indexer.""" blob_container_name = get_blob_container_name(self.indexer_type) return f"{blob_container_name}-data-source{self.suffix}" @property def vector_search_profile_name(self): + """Get the vector search profile name for the indexer.""" return ( f"{str(self.indexer_type.value)}-compass-vector-search-profile{self.suffix}" ) + @property + def vectorizer_name(self): + """Get the vectorizer name.""" + return f"{str(self.indexer_type.value)}-compass-vectorizer{self.suffix}" + + @property + def algorithm_name(self): + """Gtt the algorithm name""" + + return f"{str(self.indexer_type.value)}-hnsw-algorithm{self.suffix}" + @abstractmethod def get_index_fields(self) -> list[SearchableField]: """Get the index fields for the indexer. 
@@ -122,6 +142,7 @@ def get_index_projections(self): return None def get_synonym_map_names(self): + """Get the synonym map names for the indexer.""" return [] def get_user_assigned_managed_identity( @@ -292,67 +313,7 @@ def get_text_split_skill(self, context, source) -> SplitSkill: return text_split_skill - def get_custom_text_split_skill( - self, - context, - source, - text_split_mode="semantic", - maximum_page_length=1000, - separator=" ", - initial_threshold=0.7, - appending_threshold=0.6, - merging_threshold=0.6, - ) -> WebApiSkill: - """Get the custom skill for text split. - - Args: - ----- - context (str): The context of the skill - inputs (List[InputFieldMappingEntry]): The inputs of the skill - outputs (List[OutputFieldMappingEntry]): The outputs of the skill - - Returns: - -------- - WebApiSkill: The custom skill for text split""" - - if self.test: - batch_size = 2 - degree_of_parallelism = 2 - else: - batch_size = 2 - degree_of_parallelism = 6 - - text_split_skill_inputs = [ - InputFieldMappingEntry(name="text", source=source), - ] - - headers = { - "text_split_mode": text_split_mode, - "maximum_page_length": maximum_page_length, - "separator": separator, - "initial_threshold": initial_threshold, - "appending_threshold": appending_threshold, - "merging_threshold": merging_threshold, - } - - text_split_skill = WebApiSkill( - name="Text Split Skill", - description="Skill to split the text before sending to embedding", - context=context, - uri=get_custom_skill_function_url("split"), - timeout="PT230S", - batch_size=batch_size, - degree_of_parallelism=degree_of_parallelism, - http_method="POST", - http_headers=headers, - inputs=text_split_skill_inputs, - outputs=[OutputFieldMappingEntry(name="chunks", target_name="pages")], - auth_resource_id=get_function_app_authresourceid(), - auth_identity=self.get_user_assigned_managed_identity(), - ) - - return text_split_skill - + def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill: """Get the custom skill for 
adi. @@ -400,6 +361,46 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill: return adi_skill + def get_excel_skill(self) -> WebApiSkill: + """Get the custom skill for adi. + + Returns: + -------- + WebApiSkill: The custom skill for adi""" + + if self.test: + batch_size = 1 + degree_of_parallelism = 4 + else: + batch_size = 1 + degree_of_parallelism = 8 + + output = [ + OutputFieldMappingEntry(name="extracted_content", target_name="pages") + ] + + xlsx_skill = WebApiSkill( + name="XLSX Skill", + description="Skill to generate Markdown from XLSX", + context="/document", + uri=get_custom_skill_function_url("xlsx"), + timeout="PT230S", + batch_size=batch_size, + degree_of_parallelism=degree_of_parallelism, + http_method="POST", + http_headers={}, + inputs=[ + InputFieldMappingEntry( + name="source", source="/document/metadata_storage_path" + ) + ], + outputs=output, + auth_resource_id=get_function_app_authresourceid(), + auth_identity=self.get_user_assigned_managed_identity(), + ) + + return xlsx_skill + def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill: """Get the key phrase extraction skill. 
@@ -570,25 +571,21 @@ def get_compass_vector_search(self) -> VectorSearch: Returns: VectorSearch: The vector search configuration """ - vectorizer_name = ( - f"{str(self.indexer_type.value)}-compass-vectorizer{self.suffix}" - ) - algorithim_name = f"{str(self.indexer_type.value)}-hnsw-algorithm{self.suffix}" vector_search = VectorSearch( algorithms=[ - HnswAlgorithmConfiguration(name=algorithim_name), + HnswAlgorithmConfiguration(name=self.algorithm_name), ], profiles=[ VectorSearchProfile( name=self.vector_search_profile_name, - algorithm_configuration_name=algorithim_name, - vectorizer=vectorizer_name, + algorithm_configuration_name=self.algorithm_name, + vectorizer=self.vectorizer_name, ) ], vectorizers=[ CustomVectorizer( - name=vectorizer_name, + name=self.vectorizer_name, custom_web_api_parameters=CustomWebApiParameters( uri=get_custom_skill_function_url("compass"), auth_resource_id=get_function_app_authresourceid(), diff --git a/ai_search_with_adi/deploy.py b/ai_search_with_adi/ai_search/deploy.py similarity index 67% rename from ai_search_with_adi/deploy.py rename to ai_search_with_adi/ai_search/deploy.py index 1b2190b..d533340 100644 --- a/ai_search_with_adi/deploy.py +++ b/ai_search_with_adi/ai_search/deploy.py @@ -1,35 +1,45 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ import argparse -from environment import get_search_endpoint, get_managed_identity_id, get_search_key,get_key_vault_url +from ai_search_with_adi.ai_search.environment import ( + get_search_endpoint, + get_managed_identity_id, + get_search_key, + get_key_vault_url, +) from azure.core.credentials import AzureKeyCredential -from azure.identity import DefaultAzureCredential,ManagedIdentityCredential,EnvironmentCredential +from azure.identity import DefaultAzureCredential from azure.keyvault.secrets import SecretClient from inquiry_document import InquiryDocumentAISearch - def main(args): endpoint = get_search_endpoint() try: - credential = DefaultAzureCredential(managed_identity_client_id =get_managed_identity_id()) + credential = DefaultAzureCredential( + managed_identity_client_id=get_managed_identity_id() + ) # initializing key vault client client = SecretClient(vault_url=get_key_vault_url(), credential=credential) print("Using managed identity credential") except Exception as e: print(e) - credential = ( - AzureKeyCredential(get_search_key(client=client)) - ) + credential = AzureKeyCredential(get_search_key(client=client)) print("Using Azure Key credential") if args.indexer_type == "inquiry": # Deploy the inquiry index index_config = InquiryDocumentAISearch( - endpoint=endpoint, - credential=credential, + endpoint=endpoint, + credential=credential, suffix=args.suffix, - rebuild=args.rebuild, - enable_page_by_chunking=args.enable_page_chunking + rebuild=args.rebuild, + enable_page_by_chunking=args.enable_page_chunking, ) + else: + raise ValueError("Invalid Indexer Type") + index_config.deploy() if args.rebuild: @@ -42,7 +52,7 @@ def main(args): "--indexer_type", type=str, required=True, - help="Type of Indexer want to deploy. 
inquiry/summary/glossary", + help="Type of Indexer want to deploy.", ) parser.add_argument( "--rebuild", diff --git a/ai_search_with_adi/environment.py b/ai_search_with_adi/ai_search/environment.py similarity index 98% rename from ai_search_with_adi/environment.py rename to ai_search_with_adi/ai_search/environment.py index 7503a68..a17d3a1 100644 --- a/ai_search_with_adi/environment.py +++ b/ai_search_with_adi/ai_search/environment.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + """Module providing environment definition""" import os from dotenv import find_dotenv, load_dotenv diff --git a/ai_search_with_adi/inquiry_document.py b/ai_search_with_adi/ai_search/inquiry_document.py similarity index 66% rename from ai_search_with_adi/inquiry_document.py rename to ai_search_with_adi/ai_search/inquiry_document.py index b70251e..36a55a4 100644 --- a/ai_search_with_adi/inquiry_document.py +++ b/ai_search_with_adi/ai_search/inquiry_document.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + from azure.search.documents.indexes.models import ( SearchFieldDataType, SearchField, @@ -11,7 +14,6 @@ FieldMapping, IndexingParameters, IndexingParametersConfiguration, - BlobIndexerImageAction, SearchIndexerIndexProjections, SearchIndexerIndexProjectionSelector, SearchIndexerIndexProjectionsParameters, @@ -19,10 +21,9 @@ SimpleField, BlobIndexerDataToExtract, IndexerExecutionEnvironment, - BlobIndexerPDFTextRotationAlgorithm, ) from ai_search import AISearch -from environment import ( +from ai_search_with_adi.ai_search.environment import ( get_search_embedding_model_dimensions, IndexerType, ) @@ -47,9 +48,33 @@ def __init__( else: self.enable_page_by_chunking = False - # explicitly setting it to false no matter what output comes in - # might be removed later - # self.enable_page_by_chunking = False + @property + def index_name(self): + """Get the index name for the indexer. 
Overwritten as this class is subclassed by InquiryDocumentXLSX and they should both point to the same index""" + return f"{str(IndexerType.INQUIRY_DOCUMENT.value)}-index{self.suffix}" + + @property + def vector_search_profile_name(self): + """Get the vector search profile name for the indexer. Overwritten as this class is subclassed by InquiryDocumentXLSX and they should both point to the same index""" + return f"{str(IndexerType.INQUIRY_DOCUMENT.value)}-compass-vector-search-profile{self.suffix}" + + @property + def vectorizer_name(self): + """Get the vectorizer name. Overwritten as this class is subclassed by InquiryDocumentXLSX and they should both point to the same index""" + return ( + f"{str(IndexerType.INQUIRY_DOCUMENT.value)}-compass-vectorizer{self.suffix}" + ) + + @property + def algorithm_name(self): + """Gtt the algorithm name. Overwritten as this class is subclassed by InquiryDocumentXLSX and they should both point to the same index""" + + return f"{str(IndexerType.INQUIRY_DOCUMENT.value)}-hnsw-algorithm{self.suffix}" + + @property + def semantic_config_name(self): + """Get the semantic config name for the indexer. Overwritten as this class is subclassed by InquiryDocumentXLSX and they should both point to the same index""" + return f"{str(IndexerType.INQUIRY_DOCUMENT.value)}-semantic-config{self.suffix}" def get_index_fields(self) -> list[SearchableField]: """This function returns the index fields for inquiry document. 
@@ -60,42 +85,42 @@ def get_index_fields(self) -> list[SearchableField]: fields = [ SimpleField(name="Id", type=SearchFieldDataType.String, filterable=True), SearchableField( - name="Title", type=SearchFieldDataType.String, filterable=True + name="Field1", type=SearchFieldDataType.String, filterable=True ), SearchableField( - name="ID1", + name="Field2", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True, ), SearchableField( - name="ID2", + name="Field3", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True, ), SearchableField( - name="ChunkId", + name="Field4", type=SearchFieldDataType.String, key=True, - analyzer_name="keyword", + analyzer_name="a1", ), SearchableField( - name="Chunk", + name="Field5", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False, ), SearchableField( - name="Section", + name="Field6", type=SearchFieldDataType.String, collection=True, ), SearchField( - name="ChunkEmbedding", + name="EmbeddingField", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=get_search_embedding_model_dimensions( self.indexer_type @@ -103,17 +128,17 @@ def get_index_fields(self) -> list[SearchableField]: vector_search_profile_name=self.vector_search_profile_name, ), SearchableField( - name="Keywords", type=SearchFieldDataType.String, collection=True + name="Field7", type=SearchFieldDataType.String, collection=True ), SearchableField( - name="SourceUrl", + name="Field8", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True, ), SearchableField( - name="AdditionalMetadata", + name="Field9", type=SearchFieldDataType.String, sortable=True, filterable=True, @@ -125,7 +150,7 @@ def get_index_fields(self) -> list[SearchableField]: fields.extend( [ SearchableField( - name="PageNumber", + name="Field10", type=SearchFieldDataType.Int64, sortable=True, filterable=True, @@ -145,13 +170,13 @@ def get_semantic_search(self) -> 
SemanticSearch: semantic_config = SemanticConfiguration( name=self.semantic_config_name, prioritized_fields=SemanticPrioritizedFields( - title_field=SemanticField(field_name="Title"), - content_fields=[SemanticField(field_name="Chunk")], + title_field=SemanticField(field_name="Field1"), + content_fields=[SemanticField(field_name="Field2")], keywords_fields=[ - SemanticField(field_name="Keywords"), - SemanticField(field_name="Section"), - ], - ), + SemanticField(field_name="Field3"), + SemanticField(field_name="Field4"), + ], + ), ) semantic_search = SemanticSearch(configurations=[semantic_config]) @@ -163,14 +188,17 @@ def get_skills(self): adi_skill = self.get_adi_skill(self.enable_page_by_chunking) + text_split_skill = self.get_text_split_skill( "/document", "/document/extracted_content/content" ) + pre_embedding_cleaner_skill = self.get_pre_embedding_cleaner_skill( - "/document/pages/*", "/document/pages/*", self.enable_page_by_chunking + "/document/pages/*", "/document/pages/*", self.enable_page_by_chunking ) + key_phrase_extraction_skill = self.get_key_phrase_extraction_skill( "/document/pages/*", "/document/pages/*/cleaned_chunk" ) @@ -199,60 +227,44 @@ def get_skills(self): def get_index_projections(self) -> SearchIndexerIndexProjections: """This function returns the index projections for inquiry document.""" - mappings =[ - InputFieldMappingEntry( - name="Chunk", source="/document/pages/*/chunk" - ), - InputFieldMappingEntry( - name="ChunkEmbedding", - source="/document/pages/*/vector", - ), - InputFieldMappingEntry( - name="Title", - source="/document/Title" - ), - InputFieldMappingEntry( - name="ID1", - source="/document/ID1" - ), - InputFieldMappingEntry( - name="ID2", - source="/document/ID2" - ), - InputFieldMappingEntry( - name="SourceUrl", - source="/document/SourceUrl" - ), - InputFieldMappingEntry( - name="Keywords", - source="/document/pages/*/keywords" - ), - InputFieldMappingEntry( - name="AdditionalMetadata", - 
source="/document/AdditionalMetadata", - ), - InputFieldMappingEntry( - name="Section", - source="/document/pages/*/eachsection" - ) - ] - + mappings = [ + InputFieldMappingEntry(name="Chunk", source="/document/pages/*/chunk"), + InputFieldMappingEntry( + name="ChunkEmbedding", + source="/document/pages/*/vector", + ), + InputFieldMappingEntry(name="Field1", source="/document/Field1"), + InputFieldMappingEntry(name="Field2", source="/document/Field2"), + InputFieldMappingEntry(name="Field3", source="/document/Field3"), + InputFieldMappingEntry(name="Field4", source="/document/Field4"), + InputFieldMappingEntry( + name="Field5", source="/document/pages/*/Field5" + ), + InputFieldMappingEntry( + name="Field6", + source="/document/Field6", + ), + InputFieldMappingEntry( + name="Field7", source="/document/pages/*/Field7" + ), + ] + if self.enable_page_by_chunking: mappings.extend( [ InputFieldMappingEntry( - name="PageNumber", source="/document/pages/*/page_no" - ) - ] + name="Field8", source="/document/pages/*/Field8" + ) + ] ) - + index_projections = SearchIndexerIndexProjections( selectors=[ SearchIndexerIndexProjectionSelector( target_index_name=self.index_name, parent_key_field_name="Id", source_context="/document/pages/*", - mappings=mappings + mappings=mappings, ), ], parameters=SearchIndexerIndexProjectionsParameters( @@ -277,12 +289,9 @@ def get_indexer(self) -> SearchIndexer: indexer_parameters = IndexingParameters( batch_size=batch_size, configuration=IndexingParametersConfiguration( - # image_action=BlobIndexerImageAction.GENERATE_NORMALIZED_IMAGE_PER_PAGE, data_to_extract=BlobIndexerDataToExtract.ALL_METADATA, query_timeout=None, - # allow_skillset_to_read_file_data=True, execution_environment=IndexerExecutionEnvironment.PRIVATE, - # pdf_text_rotation_algorithm=BlobIndexerPDFTextRotationAlgorithm.DETECT_ANGLES, fail_on_unprocessable_document=False, fail_on_unsupported_content_type=False, index_storage_metadata_only_for_oversized_documents=True, @@ -302,16 
+311,16 @@ def get_indexer(self) -> SearchIndexer: FieldMapping( source_field_name="metadata_storage_name", target_field_name="Title" ), - FieldMapping(source_field_name="ID1", target_field_name="ID1"), + FieldMapping(source_field_name="Field1", target_field_name="Field1"), FieldMapping( - source_field_name="ID2", target_field_name="ID2" + source_field_name="Field2", target_field_name="Field2" ), FieldMapping( - source_field_name="SharePointUrl", target_field_name="SourceUrl" + source_field_name="Field3", target_field_name="Field3" ), FieldMapping( - source_field_name="Additional_Metadata", - target_field_name="AdditionalMetadata", + source_field_name="Field4", + target_field_name="Field4", ), ], parameters=indexer_parameters, diff --git a/ai_search_with_adi/function_apps/common/ai_search.py b/ai_search_with_adi/function_apps/common/ai_search.py index 1bba829..eedf27e 100644 --- a/ai_search_with_adi/function_apps/common/ai_search.py +++ b/ai_search_with_adi/function_apps/common/ai_search.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ from azure.search.documents.indexes.aio import SearchIndexerClient, SearchIndexClient from azure.search.documents.aio import SearchClient from azure.search.documents.indexes.models import SynonymMap @@ -75,7 +78,7 @@ async def trigger_indexer(self, indexer_name): logging.error("Unable to run indexer %s", e) async def search_index( - self, index_name, semantic_config, search_text, deal_id=None + self, index_name, semantic_config, search_text, filter_field=None ): """Search the index using the provided search text.""" async with AsyncAzureOpenAI( @@ -98,8 +101,8 @@ async def search_index( fields="ChunkEmbedding", ) - if deal_id: - filter_expression = f"DealId eq '{deal_id}'" + if filter_field: + filter_expression = f"filter_field eq '{filter_field}'" else: filter_expression = None diff --git a/ai_search_with_adi/function_apps/common/delay_processing_exception.py b/ai_search_with_adi/function_apps/common/delay_processing_exception.py new file mode 100644 index 0000000..a8ef226 --- /dev/null +++ b/ai_search_with_adi/function_apps/common/delay_processing_exception.py @@ -0,0 +1,4 @@ +class DelayProcessingException(Exception): + """Exception to delay processing.""" + + pass diff --git a/ai_search_with_adi/function_apps/common/payloads/error.py b/ai_search_with_adi/function_apps/common/payloads/error.py index 49e456e..5a7f443 100644 --- a/ai_search_with_adi/function_apps/common/payloads/error.py +++ b/ai_search_with_adi/function_apps/common/payloads/error.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ from typing import Optional from pydantic import BaseModel, Field, ConfigDict from datetime import datetime, timezone diff --git a/ai_search_with_adi/function_apps/common/payloads/header.py b/ai_search_with_adi/function_apps/common/payloads/header.py index e7a521c..d90e684 100644 --- a/ai_search_with_adi/function_apps/common/payloads/header.py +++ b/ai_search_with_adi/function_apps/common/payloads/header.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + from pydantic import BaseModel, Field, ConfigDict from datetime import datetime, timezone from enum import Enum @@ -15,8 +18,6 @@ class TaskEnum(Enum): PENDING_INDEX_COMPLETION = "pending_index_completion" PENDING_INDEX_TRIGGER = "pending_index_trigger" - PENDING_SUMMARY_GENERATION = "pending_summary_generation" - class Header(BaseModel): """Header model""" diff --git a/ai_search_with_adi/function_apps/common/payloads/payload.py b/ai_search_with_adi/function_apps/common/payloads/payload.py index fb2f4f9..b36f25f 100644 --- a/ai_search_with_adi/function_apps/common/payloads/payload.py +++ b/ai_search_with_adi/function_apps/common/payloads/payload.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + from pydantic import BaseModel, ConfigDict import logging diff --git a/ai_search_with_adi/function_apps/common/payloads/pending_index_completion.py b/ai_search_with_adi/function_apps/common/payloads/pending_index_completion.py index 8aa0335..caf2ade 100644 --- a/ai_search_with_adi/function_apps/common/payloads/pending_index_completion.py +++ b/ai_search_with_adi/function_apps/common/payloads/pending_index_completion.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ from pydantic import BaseModel, Field, ConfigDict from datetime import datetime, timezone from typing import Optional, List @@ -11,12 +14,12 @@ class PendingIndexCompletionBody(BaseModel): """Body model""" indexer: str = Field(..., description="The indexer to trigger") - deal_id: Optional[int] = Field(None, description="The deal ID") + id_field: Optional[int] = Field(None, description="The ID field") blob_storage_url: Optional[str] = Field( ..., description="The URL to the blob storage" ) - deal_name: Optional[str] = Field( - None, description="The text name for the integer deal ID" + id_name: Optional[str] = Field( + None, description="The text name for the integer ID field" ) business_unit: Optional[str] = Field(None, description="The business unit") indexer_start_time: Optional[datetime] = Field( diff --git a/ai_search_with_adi/function_apps/common/payloads/pending_index_trigger.py b/ai_search_with_adi/function_apps/common/payloads/pending_index_trigger.py index 2a519d9..e4fd62b 100644 --- a/ai_search_with_adi/function_apps/common/payloads/pending_index_trigger.py +++ b/ai_search_with_adi/function_apps/common/payloads/pending_index_trigger.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ from pydantic import BaseModel, Field, ConfigDict from typing import Optional, List @@ -10,12 +13,14 @@ class PendingIndexTriggerBody(BaseModel): """Body model""" indexer: str = Field(..., description="The indexer to trigger") - deal_id: Optional[int] = Field(None, description="The deal ID") + ## this field can be defined based on your id field + id_field: Optional[int] = Field(None, description="The ID field") blob_storage_url: str = Field(..., description="The URL to the blob storage") - deal_name: Optional[str] = Field( - None, description="The text name for the integer deal ID" + ## this field can be defined based on your id field + id_name: Optional[str] = Field( + None, description="The text name for the integer ID field" ) - business_unit: Optional[str] = Field(None, description="The business unit") + additional_field: Optional[str] = Field(None, description="Description of additional_field") __config__ = ConfigDict(extra="ignore") diff --git a/ai_search_with_adi/function_apps/common/requirements.txt b/ai_search_with_adi/function_apps/common/requirements.txt new file mode 100644 index 0000000..daa8b89 --- /dev/null +++ b/ai_search_with_adi/function_apps/common/requirements.txt @@ -0,0 +1,11 @@ +azure-storage-blob +azure-servicebus +azure-core +azure-identity +pydantic +pymongo +azure-search +azure-search-documents==11.6.0b4 +openai +aiohttp +motor diff --git a/ai_search_with_adi/function_apps/common/service_bus.py b/ai_search_with_adi/function_apps/common/service_bus.py new file mode 100644 index 0000000..9e95fe8 --- /dev/null +++ b/ai_search_with_adi/function_apps/common/service_bus.py @@ -0,0 +1,46 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import os +import logging +from datetime import datetime, timezone +from azure.identity.aio import DefaultAzureCredential +from azure.servicebus import ServiceBusMessage +from azure.servicebus.aio import ServiceBusClient + + +class ServiceBusHelper: + def __init__(self): + self._client_id = os.environ["FunctionApp__ClientId"] + + self._endpoint = os.environ["ServiceBusTrigger__fullyQualifiedNamespace"] + + async def get_client(self): + credential = DefaultAzureCredential(managed_identity_client_id=self._client_id) + return ServiceBusClient(self._endpoint, credential) + + async def send_message_to_service_bus_queue( + self, queue, payload, enqueue_time=None, retry=False + ): + # update the header + payload.header.last_processed_timestamp = datetime.now(timezone.utc) + payload.header.task = queue + + if retry: + payload.header.retries_remaining -= 1 + try: + service_bus_client = await self.get_client() + async with service_bus_client: + sender = service_bus_client.get_queue_sender(queue_name=queue.value) + + async with sender: + message = ServiceBusMessage( + body=payload.model_dump_json(), + scheduled_enqueue_time_utc=enqueue_time, + ) + await sender.send_messages(message) + logging.info( + f"Sent a message to the Azure Service Bus queue: {queue}" + ) + except Exception as e: + logging.error(f"Failed to send message to the Azure Service Bus queue: {e}") diff --git a/ai_search_with_adi/function_apps/common/storage_account.py b/ai_search_with_adi/function_apps/common/storage_account.py new file mode 100644 index 0000000..ecb4fea --- /dev/null +++ b/ai_search_with_adi/function_apps/common/storage_account.py @@ -0,0 +1,78 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import logging +import os +import tempfile +from azure.storage.blob.aio import BlobServiceClient +from azure.identity import DefaultAzureCredential +import urllib + + +class StorageAccountHelper: + def __init__(self) -> None: + self._client_id = os.environ["FunctionApp__ClientId"] + + self._endpoint = os.environ["StorageAccount__Endpoint"] + + async def get_client(self): + credential = DefaultAzureCredential(managed_identity_client_id=self._client_id) + + return BlobServiceClient(account_url=self._endpoint, credential=credential) + + async def add_metadata_to_blob(self, source: str, container: str, metadata) -> None: + """Add metadata to the business glossary blob.""" + + blob = urllib.parse.unquote_plus(source) + + blob_service_client = await self.get_client() + async with blob_service_client: + async with blob_service_client.get_blob_client( + container=container, blob=blob + ) as blob_client: + await blob_client.set_blob_metadata(metadata) + + logging.info("Metadata Added") + + async def download_blob_to_temp_dir( + self, source: str, container: str, target_file_name + ) -> tuple[str, dict]: + """Download the business glossary file from the Azure Blob Storage.""" + + blob = urllib.parse.unquote_plus(source) + + blob_service_client = await self.get_client() + async with blob_service_client: + async with blob_service_client.get_blob_client( + container=container, blob=blob + ) as blob_client: + blob_download = await blob_client.download_blob() + blob_contents = await blob_download.readall() + + blob_properties = await blob_client.get_blob_properties() + + logging.info("Blob Downloaded") + # Get the temporary directory + temp_dir = tempfile.gettempdir() + + # Define the temporary file path + temp_file_path = os.path.join(temp_dir, target_file_name) + + # Write the blob contents to the temporary file + with open(temp_file_path, "wb") as temp_file: + temp_file.write(blob_contents) + + return temp_file_path, blob_properties.metadata + + async def 
upload_business_glossary_dataframe(self, df: str, sheet: str) -> str: + """Upload the business glossary dataframe to a JSONL file.""" + json_lines = df.to_json(orient="records", lines=True) + + container = os.environ["StorageAccount__BusinessGlossary__Container"] + blob = f"{sheet}.jsonl" + blob_service_client = await self.get_client() + async with blob_service_client: + async with blob_service_client.get_blob_client( + container=container, blob=blob + ) as blob_client: + await blob_client.upload_blob(json_lines, overwrite=True) diff --git a/ai_search_with_adi/function_apps/indexer/adi_2_aisearch.py b/ai_search_with_adi/function_apps/indexer/adi_2_ai_search.py similarity index 80% rename from ai_search_with_adi/function_apps/indexer/adi_2_aisearch.py rename to ai_search_with_adi/function_apps/indexer/adi_2_ai_search.py index e0542fb..a477a85 100644 --- a/ai_search_with_adi/function_apps/indexer/adi_2_aisearch.py +++ b/ai_search_with_adi/function_apps/indexer/adi_2_ai_search.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + import base64 from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence.aio import DocumentIntelligenceClient @@ -13,7 +16,7 @@ from common.storage_account import StorageAccountHelper import concurrent.futures import json - +from openai import AzureOpenAI def crop_image_from_pdf_page(pdf_path, page_number, bounding_box): """ @@ -131,36 +134,77 @@ def update_figure_description(md_content, img_description, idx): return new_md_content -async def understand_image_with_vlm(image_base64): +async def understand_image_with_gptv(image_base64, caption): """ - Sends a base64-encoded image to a VLM (Vision Language Model) endpoint for financial analysis. + Generates a description for an image using the GPT-4V model. - Args: - image_base64 (str): The base64-encoded string representation of the image. + Parameters: + - image_base64 (str): image file. 
+ - caption (str): The caption for the image. Returns: - str: The response from the VLM, which is either a financial analysis or a statement indicating the image is not useful. + - img_description (str): The generated description for the image. """ - # prompt = "Describe the image ONLY IF it is useful for financial analysis. Otherwise, say 'NOT USEFUL IMAGE' and NOTHING ELSE. " - prompt = "Perform financial analysis of the image ONLY IF the image is of graph, chart, flowchart or table. Otherwise, say 'NOT USEFUL IMAGE' and NOTHING ELSE. " - headers = {"Content-Type": "application/json"} - data = {"prompt": prompt, "image": image_base64} - vlm_endpoint = os.environ["AIServices__VLM__Endpoint"] - async with aiohttp.ClientSession() as session: - async with session.post( - vlm_endpoint, headers=headers, json=data, timeout=30 - ) as response: - response_data = await response.json() - response_text = response_data["response"].split("")[0] - - if ( - "not useful for financial analysis" in response_text - or "NOT USEFUL IMAGE" in response_text - ): - return "Irrelevant Image" + + MAX_TOKENS = 2000 + api_key = os.environ["AzureAI_GPT4V_Key"] + api_version = os.environ["AzureAI__GPT4V_Version"] + deployment_name = os.environ["AzureAI__GPT4V_Deployment"] + api_base = os.environ["AzureAI__GPT4V_APIbase"] + + + client = AzureOpenAI( + api_key=api_key, + api_version=api_version, + base_url=f"{api_base}/openai/deployments/{deployment_name}" + ) + + # We send both image caption and the image body to GPTv for better understanding + if caption != "": + response = client.chat.completions.create( + model=deployment_name, + messages=[ + { "role": "system", "content": "You are a helpful assistant." 
}, + { "role": "user", "content": [ + { + "type": "text", + "text": f"Describe this image (note: it has image caption: {caption}):" + }, + { + "type": "image_base64", + "image_base64": { + "image": image_base64 + } + } + ] } + ], + max_tokens=MAX_TOKENS + ) + else: - return response_text + response = client.chat.completions.create( + model=deployment_name, + messages=[ + { "role": "system", "content": "You are a helpful assistant." }, + { "role": "user", "content": [ + { + "type": "text", + "text": "Describe this image:" + }, + { + "type": "image_base64", + "image_base64": { + "image": image_base64 + } + } + ] } + ], + max_tokens=MAX_TOKENS + ) + img_description = response.choices[0].message.content + + return img_description def pil_image_to_base64(image, image_format="JPEG"): """ @@ -219,7 +263,7 @@ async def process_figures_from_extracted_content( image_base64 = pil_image_to_base64(cropped_image) - img_description += await understand_image_with_vlm(image_base64) + img_description += await understand_image_with_gptv(image_base64) logging.info(f"\tDescription of figure {idx}: {img_description}") markdown_content = update_figure_description( @@ -385,46 +429,31 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> } try: - if chunk_by_page: - markdown_content,page_no = create_page_wise_content(result) - else: - markdown_content = result.content - - # Remove this line when VLM is ready - content_with_figures = markdown_content - - # if chunk_by_page: - # tasks = [ - # process_figures_from_extracted_content( - # temp_file_path, page_content, result.figures, page_number=idx - # ) - # for idx, page_content in enumerate(markdown_content) - # ] - # content_with_figures = await asyncio.gather(*tasks) - # else: - # content_with_figures = await process_figures_from_extracted_content( - # temp_file_path, markdown_content, result.figures - # ) - - # Remove remove_irrelevant_figures=True when VLM is ready if chunk_by_page: cleaned_result = [] + 
markdown_content,page_no = create_page_wise_content(result) + tasks = [ + process_figures_from_extracted_content( + temp_file_path, page_content, result.figures, page_number=idx + ) + for idx, page_content in enumerate(markdown_content) + ] + content_with_figures = await asyncio.gather(*tasks) with concurrent.futures.ProcessPoolExecutor() as executor: - results = executor.map(clean_adi_markdown,content_with_figures, page_no,[False] * len(content_with_figures)) - - for cleaned_content in results: - cleaned_result.append(cleaned_content) - - # with concurrent.futures.ProcessPoolExecutor() as executor: - # futures = { - # executor.submit( - # clean_adi_markdown, page_content, False - # ): page_content - # for page_content in content_with_figures - # } - # for future in concurrent.futures.as_completed(futures): - # cleaned_result.append(future.result()) + futures = { + executor.submit( + clean_adi_markdown, page_content, False + ): page_content + for page_content in content_with_figures + } + for future in concurrent.futures.as_completed(futures): + cleaned_result.append(future.result()) + else: + markdown_content = result.content + content_with_figures = await process_figures_from_extracted_content( + temp_file_path, markdown_content, result.figures + ) cleaned_result = clean_adi_markdown( content_with_figures, page_no=-1,remove_irrelevant_figures=False ) diff --git a/ai_search_with_adi/function_apps/indexer/function_app.py b/ai_search_with_adi/function_apps/indexer/function_app.py index 12d5d5b..6057ec7 100644 --- a/ai_search_with_adi/function_apps/indexer/function_app.py +++ b/ai_search_with_adi/function_apps/indexer/function_app.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ from datetime import datetime, timedelta, timezone import azure.functions as func import logging diff --git a/ai_search_with_adi/function_apps/indexer/key_phrase_extraction.py b/ai_search_with_adi/function_apps/indexer/key_phrase_extraction.py index c6ab40e..d8c023b 100644 --- a/ai_search_with_adi/function_apps/indexer/key_phrase_extraction.py +++ b/ai_search_with_adi/function_apps/indexer/key_phrase_extraction.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + import logging import json import os diff --git a/ai_search_with_adi/function_apps/indexer/ocr.py b/ai_search_with_adi/function_apps/indexer/ocr.py new file mode 100644 index 0000000..e179eb1 --- /dev/null +++ b/ai_search_with_adi/function_apps/indexer/ocr.py @@ -0,0 +1,86 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import logging +import os +from azure.ai.vision.imageanalysis.aio import ImageAnalysisClient +from azure.ai.vision.imageanalysis.models import VisualFeatures +from azure.core.credentials import AzureKeyCredential + + +async def process_ocr(record: dict) -> dict: + logging.info("Python HTTP trigger function processed a request.") + + try: + url = record["data"]["image"]["url"] + logging.info(f"Request Body: {record}") + except KeyError: + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": "Failed to extract data with ocr. 
Pass a valid source in the request body.", + } + ], + "warnings": None, + } + else: + logging.info(f"image url: {url}") + + if url is not None: + try: + client = ImageAnalysisClient( + endpoint=os.environ["AIService__Services__Endpoint"], + credential=AzureKeyCredential( + os.environ["AIService__Services__Key"] + ), + ) + result = await client.analyze_from_url( + image_url=url, visual_features=[VisualFeatures.READ] + ) + logging.info("logging output") + + # Extract text from OCR results + text = " ".join([line.text for line in result.read.blocks[0].lines]) + logging.info(text) + + except KeyError as e: + logging.error(e) + logging.error(f"Failed to authenticate with ocr: {e}") + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": f"Failed to authenticate with Ocr. Check the service credentials exist. {e}", + } + ], + "warnings": None, + } + except Exception as e: + logging.error(e) + logging.error( + f"Failed to analyze the document with Azure Document Intelligence: {e}" + ) + logging.error(e.InnerError) + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": f"Failed to analyze the document with ocr. Check the source and try again. {e}", + } + ], + "warnings": None, + } + else: + return { + "recordId": record["recordId"], + "data": {"text": ""}, + } + + return { + "recordId": record["recordId"], + "data": {"text": text}, + } diff --git a/ai_search_with_adi/function_apps/indexer/pending_index_completion.py b/ai_search_with_adi/function_apps/indexer/pending_index_completion.py index e69de29..3488488 100644 --- a/ai_search_with_adi/function_apps/indexer/pending_index_completion.py +++ b/ai_search_with_adi/function_apps/indexer/pending_index_completion.py @@ -0,0 +1,107 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +from common.ai_search import AISearchHelper, IndexerStatusEnum +from common.service_bus import ServiceBusHelper +from common.payloads.pending_index_completion import PendingIndexCompletionPayload +from common.payloads.pending_index_trigger import PendingIndexTriggerPayload +from common.payloads.header import TaskEnum, DataTypeEnum +from common.payloads.error import Error +from datetime import datetime, timedelta, timezone +from common.delay_processing_exception import DelayProcessingException +import asyncio + + +async def process_pending_index_completion(payload: PendingIndexCompletionPayload): + """Process the pending index completion.""" + ai_search_helper = AISearchHelper() + service_bus_helper = ServiceBusHelper() + + status, indexer_start_time = await ai_search_helper.get_indexer_status( + payload.body.indexer + ) + request_time = payload.header.creation_timestamp + enqueue_time = None + queue = None + messages = [] + retry = False + + if status == IndexerStatusEnum.RETRIGGER and payload.header.retries_remaining > 0: + # Trigger the indexer + await ai_search_helper.trigger_indexer(payload.body.indexer) + + errors = [error_item.model_dump() for error_item in payload.errors] + errors.append( + Error( + code="IndexerNotCompleted", + message="Indexer was was in failed state and required retriggering.", + ) + ) + messages.append( + PendingIndexCompletionPayload( + header=payload.header.model_dump(), + body=payload.body.model_dump(), + errors=errors, + ) + ) + queue = TaskEnum.PENDING_INDEX_COMPLETION + minutes = 2 ** (11 - payload.header.retries_remaining) + enqueue_time = datetime.now(timezone.utc) + timedelta(minutes=minutes) + retry = True + elif status == IndexerStatusEnum.RUNNING and payload.header.retries_remaining > 0: + errors = [error_item.model_dump() for error_item in payload.errors] + errors.append( + Error( + code="IndexerNotCompleted", + message="Indexer was completed not at the time of running.", + ) + ) + messages.append( + 
PendingIndexCompletionPayload( + header=payload.header.model_dump(), + body=payload.body.model_dump(), + errors=errors, + ) + ) + queue = TaskEnum.PENDING_INDEX_COMPLETION + minutes = 2 ** (11 - payload.header.retries_remaining) + enqueue_time = datetime.now(timezone.utc) + timedelta(minutes=minutes) + retry = True + elif ( + status == IndexerStatusEnum.SUCCESS + and indexer_start_time <= request_time + and payload.header.retries_remaining > 0 + ): + errors = [error_item.model_dump() for error_item in payload.errors] + errors.append( + Error( + code="IndexerNotTriggered", + message="Indexer was not triggered.", + ) + ) + messages.append( + PendingIndexTriggerPayload( + header=payload.header.model_dump(), + body=payload.body.model_dump(), + errors=errors, + ) + ) + queue = TaskEnum.PENDING_INDEX_TRIGGER + minutes = 2 ** (11 - payload.header.retries_remaining) + enqueue_time = datetime.now(timezone.utc) + timedelta(minutes=minutes) + retry = True + else: + raise DelayProcessingException( + "Failed to run trigger due to maximum retries exceeded." + ) + + if queue is not None and len(messages) > 0: + message_tasks = [] + for message in messages: + message_tasks.append( + service_bus_helper.send_message_to_service_bus_queue( + queue, message, enqueue_time=enqueue_time, retry=retry + ) + ) + + await asyncio.gather(*message_tasks) diff --git a/ai_search_with_adi/function_apps/indexer/pending_index_trigger.py b/ai_search_with_adi/function_apps/indexer/pending_index_trigger.py new file mode 100644 index 0000000..f803623 --- /dev/null +++ b/ai_search_with_adi/function_apps/indexer/pending_index_trigger.py @@ -0,0 +1,94 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +from common.ai_search import AISearchHelper, IndexerStatusEnum +from common.service_bus import ServiceBusHelper +from common.payloads.pending_index_trigger import PendingIndexTriggerPayload +from common.payloads.pending_index_completion import PendingIndexCompletionPayload +from common.payloads.header import TaskEnum +from datetime import datetime, timedelta, timezone +from common.delay_processing_exception import DelayProcessingException +from common.payloads.error import Error + + +async def process_pending_index_trigger(payload: PendingIndexTriggerPayload): + """Process the pending index trigger.""" + + ai_search_helper = AISearchHelper() + service_bus_helper = ServiceBusHelper() + + status, indexer_start_time = await ai_search_helper.get_indexer_status( + payload.body.indexer + ) + request_time = payload.header.last_processed_timestamp + enqueue_time = None + queue = None + message = None + retry = False + + if status == IndexerStatusEnum.SUCCESS and indexer_start_time > request_time: + errors = [error_item.model_dump() for error_item in payload.errors] + message = PendingIndexCompletionPayload( + header=payload.header.model_dump(), + body=payload.body.model_dump(), + errors=errors, + ) + queue = TaskEnum.PENDING_INDEX_COMPLETION + elif status == IndexerStatusEnum.RETRIGGER or status == IndexerStatusEnum.SUCCESS: + # Trigger the indexer + await ai_search_helper.trigger_indexer(payload.body.indexer) + + errors = [error_item.model_dump() for error_item in payload.errors] + + if status == IndexerStatusEnum.RETRIGGER: + errors.append( + Error( + code="IndexerNotCompleted", + message="Indexer was was in failed state and required retriggering.", + ) + ) + + message = PendingIndexCompletionPayload( + header=payload.header.model_dump(), + body=payload.body.model_dump(), + errors=errors, + ) + queue = TaskEnum.PENDING_INDEX_COMPLETION + elif status == IndexerStatusEnum.RUNNING and indexer_start_time > request_time: + errors = [error_item.model_dump() for error_item 
in payload.errors] + message = PendingIndexCompletionPayload( + header=payload.header.model_dump(), + body=payload.body.model_dump(), + errors=errors, + ) + queue = TaskEnum.PENDING_INDEX_COMPLETION + elif ( + status == IndexerStatusEnum.RUNNING + and indexer_start_time <= request_time + and payload.header.retries_remaining > 0 + ): + errors = [error_item.model_dump() for error_item in payload.errors] + errors.append( + Error( + code="IndexerAlreadyRunning", + message="Indexer is already running for an outstanding request.", + ) + ) + message = PendingIndexTriggerPayload( + header=payload.header.model_dump(), + body=payload.body.model_dump(), + errors=errors, + ) + queue = TaskEnum.PENDING_INDEX_TRIGGER + minutes = 2 ** (11 - payload.header.retries_remaining) + enqueue_time = datetime.now(timezone.utc) + timedelta(minutes=minutes) + retry = True + else: + raise DelayProcessingException( + "Failed to run trigger due to maximum retries exceeded." + ) + + if queue is not None: + await service_bus_helper.send_message_to_service_bus_queue( + queue, message, enqueue_time=enqueue_time, retry=retry + ) diff --git a/ai_search_with_adi/function_apps/indexer/pre_embedding_cleaner.py b/ai_search_with_adi/function_apps/indexer/pre_embedding_cleaner.py index 2fdf87a..79cbaae 100644 --- a/ai_search_with_adi/function_apps/indexer/pre_embedding_cleaner.py +++ b/ai_search_with_adi/function_apps/indexer/pre_embedding_cleaner.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + import logging import json import string diff --git a/ai_search_with_adi/function_apps/indexer/text_split.py b/ai_search_with_adi/function_apps/indexer/text_split.py new file mode 100644 index 0000000..8121c70 --- /dev/null +++ b/ai_search_with_adi/function_apps/indexer/text_split.py @@ -0,0 +1,355 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import spacy +import logging +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +import json +from sklearn.metrics.pairwise import cosine_similarity + +nlp = spacy.load("en_core_web_md") + + +class RecursiveCharacterTextSplitter: + def __init__(self, fragment_size=100, division_chars=["\n\n", "\n", " ", ""]): + self.fragment_size = fragment_size + self.division_chars = division_chars + + def split_text(self, text): + return self._recursive_split(text, 0) + + def _recursive_split(self, text, char_idx): + if len(text) <= self.fragment_size or char_idx >= len(self.division_chars): + return [text] + + char = self.division_chars[char_idx] + fragments = text.split(char) + result = [] + current_fragment = "" + + for fragment in fragments: + if len(current_fragment) + len(fragment) + len(char) <= self.fragment_size: + current_fragment += char + fragment + else: + if current_fragment: + result.append(current_fragment) + current_fragment = fragment + + if current_fragment: + result.append(current_fragment) + + if any(len(frag) > self.fragment_size for frag in result): + return self._recursive_split(text, char_idx + 1) + + return result + + +class CharacterTextSplitter: + def __init__(self, fragment_size=100, separator=" "): + self.fragment_size = fragment_size + self.separator = separator + + def split_text(self, text): + fragments = text.split(self.separator) + result = [] + current_fragment = "" + + for fragment in fragments: + if ( + len(current_fragment) + len(fragment) + len(self.separator) + <= self.fragment_size + ): + current_fragment += self.separator + fragment + else: + if current_fragment: + result.append(current_fragment) + current_fragment = fragment + + if current_fragment: + result.append(current_fragment) + + return result + + +class RecursiveTextSplitter: + def __init__(self, fragment_size=100, division_tokens=["\n\n", "\n", " ", ""]): + self.fragment_size = fragment_size + self.division_tokens = division_tokens + + def split_text(self, 
text): + return self._recursive_split(text, 0) + + def _recursive_split(self, text, token_idx): + if len(text) <= self.fragment_size or token_idx >= len(self.division_tokens): + return [text] + + token = self.division_tokens[token_idx] + fragments = text.split(token) + result = [] + current_fragment = "" + + for fragment in fragments: + if len(current_fragment) + len(fragment) + len(token) <= self.fragment_size: + current_fragment += token + fragment + else: + if current_fragment: + result.append(current_fragment) + current_fragment = fragment + + if current_fragment: + result.append(current_fragment) + + if any(len(frag) > self.fragment_size for frag in result): + return self._recursive_split(text, token_idx + 1) + + return result + + +class SemanticDoubleMergingSplitterNodeParser: + def __init__( + self, + initial_threshold=0.8, + appending_threshold=0.7, + merging_threshold=0.75, + fragment_size=100, + spacy_model="en_core_web_md", + ): + self.initial_threshold = initial_threshold + self.appending_threshold = appending_threshold + self.merging_threshold = merging_threshold + self.fragment_size = fragment_size + try: + self.nlp = spacy.load(spacy_model) + except IOError: + raise ValueError( + f"Spacy model '{spacy_model}' not found. 
Please download it using 'python -m spacy download {spacy_model}'" + ) + + def split_text(self, text): + sentences = self._split_into_sentences(text) + initial_chunks = self._initial_pass(sentences) + final_chunks = self._second_pass(initial_chunks) + return final_chunks + + def _split_into_sentences(self, text): + doc = self.nlp(text) + sentences = [sent.text for sent in doc.sents] + return sentences + + def _initial_pass(self, sentences): + chunks = [] + current_chunk = [] + + i = 0 + while i < len(sentences): + current_chunk.append(sentences[i]) + if len(current_chunk) >= 2: + cosine_sim = self._cosine_similarity( + " ".join(current_chunk[-2:]), sentences[i] + ) + if ( + cosine_sim < self.initial_threshold + or len(" ".join(current_chunk)) > self.fragment_size + ): + if len(current_chunk) > 2: + chunks.append(" ".join(current_chunk[:-1])) + current_chunk = [current_chunk[-1]] + else: + chunks.append(current_chunk[0]) + current_chunk = [current_chunk[1]] + i += 1 + + if current_chunk: + chunks.append(" ".join(current_chunk)) + + return chunks + + def _second_pass(self, chunks): + merged_chunks = [] + current_chunk = chunks[0] + + i = 1 + while i < len(chunks): + cosine_sim = self._cosine_similarity(current_chunk, chunks[i]) + if ( + cosine_sim >= self.merging_threshold + and len(current_chunk + " " + chunks[i]) <= self.fragment_size + ): + current_chunk += " " + chunks[i] + else: + merged_chunks.append(current_chunk) + current_chunk = chunks[i] + i += 1 + + merged_chunks.append(current_chunk) + return merged_chunks + + def _cosine_similarity(self, text1, text2): + vec1 = self.nlp(text1).vector + vec2 = self.nlp(text2).vector + return cosine_similarity([vec1], [vec2])[0, 0] + + +class FlanT5Chunker: + def __init__( + self, model_name="chentong00/propositionizer-wiki-flan-t5-large", device="cpu" + ): + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device) + self.device = device + 
self.max_length = 512 # Model's maximum token length + + def flan_t5_chunking(self, text, chunk_size=500, stride=20): + input_text = f"Title: . Section: . Content: {text}" + input_ids = self.tokenizer(input_text, return_tensors="pt").input_ids.to( + self.device + ) + total_length = input_ids.shape[1] + + chunks = [] + for i in range(0, total_length, chunk_size - stride): + end = min(i + chunk_size, total_length) + chunk_input_ids = input_ids[:, i:end] + outputs = self.model.generate( + chunk_input_ids, max_new_tokens=self.max_length + ).cpu() + output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True) + try: + prop_list = json.loads(output_text) + except json.JSONDecodeError: + prop_list = [] + print("[ERROR] Failed to parse output text as JSON.") + chunks.append(prop_list) + + # Flatten the list of lists + return [item for sublist in chunks for item in sublist] + + +def clean_input(value): + """Clean the input value. + + Args: + value: The input value. + + Returns: + The cleaned value.""" + if isinstance(value, str): + return value.strip('"') + return value + + +async def process_text_split(record: dict, text_split_config: dict) -> dict: + """Process the text split request. + + Args: + record (dict): The request record. + text_split_config (dict): The headers for config. + + Returns: + dict: The response record. + """ + try: + data = record["data"] + text = clean_input(data.get("text")) + logging.info(f"Request Body: {record}") + except KeyError: + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": "Failed to split text. Pass valid parameters.", + } + ], + "warnings": None, + } + else: + if text is None: + logging.error("Failed to split text. Pass valid text.") + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": "Failed to split text. 
Pass valid text.", + } + ], + "warnings": None, + } + + splitter_type = clean_input( + text_split_config.get("text_split_mode", "recursive_character") + ) + fragment_size = float( + clean_input(text_split_config.get("maximum_page_length", 100)) + ) + separator = clean_input(text_split_config.get("separator", " ")) + initial_threshold = float( + clean_input(text_split_config.get("initial_threshold", 0.8)) + ) + appending_threshold = float( + clean_input(text_split_config.get("appending_threshold", 0.7)) + ) + merging_threshold = float( + clean_input(text_split_config.get("merging_threshold", 0.75)) + ) + + try: + if splitter_type == "recursive_character": + splitter = RecursiveCharacterTextSplitter(fragment_size=fragment_size) + elif splitter_type == "character": + splitter = CharacterTextSplitter( + fragment_size=fragment_size, separator=separator + ) + elif splitter_type == "recursive": + splitter = RecursiveTextSplitter(fragment_size=fragment_size) + elif splitter_type == "semantic": + splitter = SemanticDoubleMergingSplitterNodeParser( + initial_threshold=initial_threshold, + appending_threshold=appending_threshold, + merging_threshold=merging_threshold, + fragment_size=fragment_size, + ) + elif splitter_type == "flan_t5": + splitter = FlanT5Chunker() + else: + logging.error("Failed to split text. Pass valid splitter type.") + logging.error(f"Splitter Type: {splitter_type}") + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": "Failed to split text. Pass valid splitter type.", + } + ], + "warnings": None, + } + + if splitter_type == "flan_t5": + chunks = splitter.flan_t5_chunking(text) + else: + chunks = splitter.split_text(text) + except Exception as e: + logging.error(f"Error during splitting: {e}") + + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": f"Failed to split text. Check function app logs for more details of exact failure. 
{str(e)}", + } + ], + "warnings": None, + } + + else: + return { + "recordId": record["recordId"], + "data": { + "chunks": chunks, + }, + "errors": None, + "warnings": None, + } From ad8684f9184add86345a492412148f3aa3de900d Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 11:07:27 +0100 Subject: [PATCH 06/33] Update some of the deployment scripts --- ai_search_with_adi/ai_search/ai_search.py | 315 ++++-------------- ai_search_with_adi/ai_search/deploy.py | 7 +- ai_search_with_adi/ai_search/environment.py | 17 +- .../{inquiry_document.py => rag_documents.py} | 122 ++----- .../function_apps/indexer/adi_2_ai_search.py | 92 ++--- 5 files changed, 157 insertions(+), 396 deletions(-) rename ai_search_with_adi/ai_search/{inquiry_document.py => rag_documents.py} (63%) diff --git a/ai_search_with_adi/ai_search/ai_search.py b/ai_search_with_adi/ai_search/ai_search.py index 6ababd7..d2c9ba5 100644 --- a/ai_search_with_adi/ai_search/ai_search.py +++ b/ai_search_with_adi/ai_search/ai_search.py @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
- +import logging from abc import ABC, abstractmethod from azure.search.documents.indexes.models import ( SearchIndex, @@ -12,8 +12,9 @@ NativeBlobSoftDeleteDeletionDetectionPolicy, HighWaterMarkChangeDetectionPolicy, WebApiSkill, - CustomVectorizer, - CustomWebApiParameters, + AzureOpenAIEmbeddingSkill, + AzureOpenAIVectorizer, + AzureOpenAIParameters, SearchIndexer, SearchIndexerSkillset, SearchIndexerDataContainer, @@ -23,11 +24,8 @@ OutputFieldMappingEntry, InputFieldMappingEntry, SynonymMap, - DocumentExtractionSkill, - OcrSkill, - MergeSkill, - ConditionalSkill, SplitSkill, + SearchIndexerIndexProjections, ) from azure.core.exceptions import HttpResponseError from azure.search.documents.indexes import SearchIndexerClient, SearchIndexClient @@ -37,7 +35,6 @@ get_custom_skill_function_url, get_managed_identity_fqname, get_function_app_authresourceid, - IndexerType, ) @@ -53,7 +50,10 @@ def __init__( Args: endpoint (str): The search endpoint - credential (AzureKeyCredential): The search credential""" + credential (AzureKeyCredential): The search credential + suffix (str, optional): The suffix for the indexer. Defaults to None. + rebuild (bool, optional): Whether to rebuild the index. Defaults to False. 
+ """ self.indexer_type = None if rebuild is not None: @@ -100,20 +100,18 @@ def data_source_name(self): @property def vector_search_profile_name(self): """Get the vector search profile name for the indexer.""" - return ( - f"{str(self.indexer_type.value)}-compass-vector-search-profile{self.suffix}" - ) + return f"{str(self.indexer_type.value)}-vector-search-profile{self.suffix}" @property def vectorizer_name(self): """Get the vectorizer name.""" - return f"{str(self.indexer_type.value)}-compass-vectorizer{self.suffix}" + return f"{str(self.indexer_type.value)}-vectorizer{self.suffix}" @property def algorithm_name(self): - """Gtt the algorithm name""" + """Get the algorithm name""" - return f"{str(self.indexer_type.value)}-hnsw-algorithm{self.suffix}" + return f"{str(self.indexer_type.value)}-algorithm{self.suffix}" @abstractmethod def get_index_fields(self) -> list[SearchableField]: @@ -130,18 +128,21 @@ def get_semantic_search(self) -> SemanticSearch: SemanticSearch: The semantic search configuration""" @abstractmethod - def get_skills(self): - """Get the skillset for the indexer.""" + def get_skills(self) -> list: + """Get the skillset for the indexer. 
+ + Returns: + list: The skillsets used in the indexer""" @abstractmethod def get_indexer(self) -> SearchIndexer: """Get the indexer for the indexer.""" - def get_index_projections(self): + @abstractmethod + def get_index_projections(self) -> SearchIndexerIndexProjections: """Get the index projections for the indexer.""" - return None - def get_synonym_map_names(self): + def get_synonym_map_names(self) -> list[str]: """Get the synonym map names for the indexer.""" return [] @@ -158,12 +159,7 @@ def get_user_assigned_managed_identity( def get_data_source(self) -> SearchIndexerDataSourceConnection: """Get the data source for the indexer.""" - if self.indexer_type == IndexerType.BUSINESS_GLOSSARY: - data_deletion_detection_policy = None - else: - data_deletion_detection_policy = ( - NativeBlobSoftDeleteDeletionDetectionPolicy() - ) + data_deletion_detection_policy = NativeBlobSoftDeleteDeletionDetectionPolicy() data_change_detection_policy = HighWaterMarkChangeDetectionPolicy( high_water_mark_column_name="metadata_storage_last_modified" @@ -185,52 +181,6 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection: return data_source_connection - def get_compass_vector_custom_skill( - self, context, source, target_name="vector" - ) -> WebApiSkill: - """Get the custom skill for compass. 
- - Args: - ----- - context (str): The context of the skill - source (str): The source of the skill - target_name (str): The target name of the skill - - Returns: - -------- - WebApiSkill: The custom skill for compass""" - - if self.test: - batch_size = 2 - degree_of_parallelism = 2 - else: - batch_size = 4 - degree_of_parallelism = 8 - - embedding_skill_inputs = [ - InputFieldMappingEntry(name="text", source=source), - ] - embedding_skill_outputs = [ - OutputFieldMappingEntry(name="vector", target_name=target_name) - ] - # Limit the number of documents to be processed in parallel to avoid timing out on compass api - embedding_skill = WebApiSkill( - name="Compass Connector API", - description="Skill to generate embeddings via compass API connector", - context=context, - uri=get_custom_skill_function_url("compass"), - timeout="PT230S", - batch_size=batch_size, - degree_of_parallelism=degree_of_parallelism, - http_method="POST", - inputs=embedding_skill_inputs, - outputs=embedding_skill_outputs, - auth_resource_id=get_function_app_authresourceid(), - auth_identity=self.get_user_assigned_managed_identity(), - ) - - return embedding_skill - def get_pre_embedding_cleaner_skill( self, context, source, chunk_by_page=False, target_name="cleaned_chunk" ) -> WebApiSkill: @@ -260,13 +210,15 @@ def get_pre_embedding_cleaner_skill( pre_embedding_cleaner_skill_outputs = [ OutputFieldMappingEntry(name="cleaned_chunk", target_name=target_name), OutputFieldMappingEntry(name="chunk", target_name="chunk"), - OutputFieldMappingEntry(name="section", target_name="eachsection"), + OutputFieldMappingEntry(name="section", target_name="section"), ] if chunk_by_page: pre_embedding_cleaner_skill_outputs.extend( [ - OutputFieldMappingEntry(name="page_number", target_name="page_no"), + OutputFieldMappingEntry( + name="page_number", target_name="page_number" + ), ] ) @@ -313,7 +265,6 @@ def get_text_split_skill(self, context, source) -> SplitSkill: return text_split_skill - def 
get_adi_skill(self, chunk_by_page=False) -> WebApiSkill: """Get the custom skill for adi. @@ -361,45 +312,32 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill: return adi_skill - def get_excel_skill(self) -> WebApiSkill: - """Get the custom skill for adi. + def get_vector_skill( + self, context, source, target_name="vector" + ) -> AzureOpenAIEmbeddingSkill: + """Get the vector skill for the indexer. Returns: - -------- - WebApiSkill: The custom skill for adi""" + AzureOpenAIEmbeddingSkill: The vector skill for the indexer""" - if self.test: - batch_size = 1 - degree_of_parallelism = 4 - else: - batch_size = 1 - degree_of_parallelism = 8 - - output = [ - OutputFieldMappingEntry(name="extracted_content", target_name="pages") + embedding_skill_inputs = [ + InputFieldMappingEntry(name="text", source=source), + ] + embedding_skill_outputs = [ + OutputFieldMappingEntry(name="vector", target_name=target_name) ] - xlsx_skill = WebApiSkill( - name="XLSX Skill", - description="Skill to generate Markdown from XLSX", - context="/document", - uri=get_custom_skill_function_url("xlsx"), - timeout="PT230S", - batch_size=batch_size, - degree_of_parallelism=degree_of_parallelism, - http_method="POST", - http_headers={}, - inputs=[ - InputFieldMappingEntry( - name="source", source="/document/metadata_storage_path" - ) - ], - outputs=output, - auth_resource_id=get_function_app_authresourceid(), - auth_identity=self.get_user_assigned_managed_identity(), + vector_skill = AzureOpenAIEmbeddingSkill( + name="Vector Skill", + description="Skill to generate embeddings", + context=context, + deployment_id="0", + model_name="text-embedding-3-large", + inputs=embedding_skill_inputs, + outputs=embedding_skill_outputs, ) - return xlsx_skill + return vector_skill def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill: """Get the key phrase extraction skill. 
@@ -443,126 +381,7 @@ def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill: return key_phrase_extraction_skill - def get_document_extraction_skill(self, context, source) -> DocumentExtractionSkill: - """Get the document extraction utility skill. - - Args: - ----- - context (str): The context of the skill - source (str): The source of the skill - - Returns: - -------- - DocumentExtractionSkill: The document extraction utility skill""" - - doc_extraction_skill = DocumentExtractionSkill( - description="Extraction skill to extract content from office docs like excel, ppt, doc etc", - context=context, - inputs=[InputFieldMappingEntry(name="file_data", source=source)], - outputs=[ - OutputFieldMappingEntry( - name="content", target_name="extracted_content" - ), - OutputFieldMappingEntry( - name="normalized_images", target_name="extracted_normalized_images" - ), - ], - ) - - return doc_extraction_skill - - def get_ocr_skill(self, context, source) -> OcrSkill: - """Get the ocr utility skill - Args: - ----- - context (str): The context of the skill - source (str): The source of the skill - - Returns: - -------- - OcrSkill: The ocr skill""" - - if self.test: - batch_size = 2 - degree_of_parallelism = 2 - else: - batch_size = 2 - degree_of_parallelism = 2 - - ocr_skill_inputs = [ - InputFieldMappingEntry(name="image", source=source), - ] - ocr__skill_outputs = [OutputFieldMappingEntry(name="text", target_name="text")] - ocr_skill = WebApiSkill( - name="ocr API", - description="Skill to extract text from images", - context=context, - uri=get_custom_skill_function_url("ocr"), - timeout="PT230S", - batch_size=batch_size, - degree_of_parallelism=degree_of_parallelism, - http_method="POST", - inputs=ocr_skill_inputs, - outputs=ocr__skill_outputs, - auth_resource_id=get_function_app_authresourceid(), - auth_identity=self.get_user_assigned_managed_identity(), - ) - - return ocr_skill - - def get_merge_skill(self, context, source) -> MergeSkill: - """Get the 
merge - Args: - ----- - context (str): The context of the skill - source (array): The source of the skill - - Returns: - -------- - mergeSkill: The merge skill""" - - merge_skill = MergeSkill( - description="Merge skill for combining OCR'd and regular text", - context=context, - inputs=[ - InputFieldMappingEntry(name="text", source=source[0]), - InputFieldMappingEntry(name="itemsToInsert", source=source[1]), - InputFieldMappingEntry(name="offsets", source=source[2]), - ], - outputs=[ - OutputFieldMappingEntry(name="mergedText", target_name="merged_content") - ], - ) - - return merge_skill - - def get_conditional_skill(self, context, source) -> ConditionalSkill: - """Get the merge - Args: - ----- - context (str): The context of the skill - source (array): The source of the skill - - Returns: - -------- - ConditionalSkill: The conditional skill""" - - conditional_skill = ConditionalSkill( - description="Select between OCR and Document Extraction output", - context=context, - inputs=[ - InputFieldMappingEntry(name="condition", source=source[0]), - InputFieldMappingEntry(name="whenTrue", source=source[1]), - InputFieldMappingEntry(name="whenFalse", source=source[2]), - ], - outputs=[ - OutputFieldMappingEntry(name="output", target_name="updated_content") - ], - ) - - return conditional_skill - - def get_compass_vector_search(self) -> VectorSearch: + def get_vector_search(self) -> VectorSearch: """Get the vector search configuration for compass. 
Args: @@ -584,13 +403,9 @@ def get_compass_vector_search(self) -> VectorSearch: ) ], vectorizers=[ - CustomVectorizer( + AzureOpenAIVectorizer( name=self.vectorizer_name, - custom_web_api_parameters=CustomWebApiParameters( - uri=get_custom_skill_function_url("compass"), - auth_resource_id=get_function_app_authresourceid(), - auth_identity=self.get_user_assigned_managed_identity(), - ), + azure_open_ai_parameters=AzureOpenAIParameters(), ), ], ) @@ -601,7 +416,7 @@ def deploy_index(self): """This function deploys index""" index_fields = self.get_index_fields() - vector_search = self.get_compass_vector_search() + vector_search = self.get_vector_search() semantic_search = self.get_semantic_search() index = SearchIndex( name=self.index_name, @@ -613,7 +428,7 @@ def deploy_index(self): self._search_index_client.delete_index(self.index_name) self._search_index_client.create_or_update_index(index) - print(f"{index.name} created") + logging.info("%s index created", index.name) def deploy_skillset(self): """This function deploys the skillset.""" @@ -628,7 +443,8 @@ def deploy_skillset(self): ) self._search_indexer_client.create_or_update_skillset(skillset) - print(f"{skillset.name} created") + + logging.info("%s skillset created", skillset.name) def deploy_data_source(self): """This function deploys the data source.""" @@ -638,9 +454,7 @@ def deploy_data_source(self): data_source ) - print(f"Data source '{result.name}' created or updated") - - return result + logging.info("%s data source created", result.name) def deploy_indexer(self): """This function deploys the indexer.""" @@ -648,33 +462,34 @@ def deploy_indexer(self): result = self._search_indexer_client.create_or_update_indexer(indexer) - print(f"Indexer '{result.name}' created or updated") - - return result + logging.info("%s indexer created", result.name) def run_indexer(self): """This function runs the indexer.""" self._search_indexer_client.run_indexer(self.indexer_name) - print( - f"{self.indexer_name} is 
running. If queries return no results, please wait a bit and try again." + logging.info( + "%s is running. If queries return no results, please wait a bit and try again.", + self.indexer_name, ) def reset_indexer(self): """This function runs the indexer.""" self._search_indexer_client.reset_indexer(self.indexer_name) - print(f"{self.indexer_name} reset.") + logging.info("%s reset.", self.indexer_name) + + def deploy_synonym_map(self): + """This function deploys the synonym map.""" - def deploy_synonym_map(self) -> list[SearchableField]: synonym_maps = self.get_synonym_map_names() if len(synonym_maps) > 0: for synonym_map in synonym_maps: try: synonym_map = SynonymMap(name=synonym_map, synonyms="") - self._search_index_client.create_synonym_map(synonym_map) - except HttpResponseError: - print("Unable to deploy synonym map as it already exists.") + self._search_index_client.create_or_update_synonym_map(synonym_map) + except HttpResponseError as e: + logging.error("Unable to deploy synonym map. 
%s", e) def deploy(self): """This function deploys the whole AI search pipeline.""" @@ -684,4 +499,4 @@ def deploy(self): self.deploy_skillset() self.deploy_indexer() - print(f"{str(self.indexer_type.value)} deployed") + logging.info("%s setup deployed", self.indexer_type.value) diff --git a/ai_search_with_adi/ai_search/deploy.py b/ai_search_with_adi/ai_search/deploy.py index d533340..5e1ffb2 100644 --- a/ai_search_with_adi/ai_search/deploy.py +++ b/ai_search_with_adi/ai_search/deploy.py @@ -11,7 +11,8 @@ from azure.core.credentials import AzureKeyCredential from azure.identity import DefaultAzureCredential from azure.keyvault.secrets import SecretClient -from inquiry_document import InquiryDocumentAISearch +from ai_search_with_adi.ai_search.rag_documents import RagDocumentsAISearch + def main(args): endpoint = get_search_endpoint() @@ -28,9 +29,9 @@ def main(args): credential = AzureKeyCredential(get_search_key(client=client)) print("Using Azure Key credential") - if args.indexer_type == "inquiry": + if args.indexer_type == "rag": # Deploy the inquiry index - index_config = InquiryDocumentAISearch( + index_config = RagDocumentsAISearch( endpoint=endpoint, credential=credential, suffix=args.suffix, diff --git a/ai_search_with_adi/ai_search/environment.py b/ai_search_with_adi/ai_search/environment.py index a17d3a1..b806fe6 100644 --- a/ai_search_with_adi/ai_search/environment.py +++ b/ai_search_with_adi/ai_search/environment.py @@ -12,17 +12,17 @@ class IndexerType(Enum): """The type of the indexer""" - INQUIRY_DOCUMENT = "inquiry-document" - SUMMARY_DOCUMENT = "summary-document" - BUSINESS_GLOSSARY = "business-glossary" + RAG_DOCUMENTS = "rag-documents" + # key vault -def get_key_vault_url() ->str: +def get_key_vault_url() -> str: """ This function returns key vault url """ return os.environ.get("KeyVault__Url") + # managed identity id def get_managed_identity_id() -> str: """ @@ -52,12 +52,14 @@ def get_function_app_end_point() -> str: """ return 
os.environ.get("FunctionApp__Endpoint") + def get_function_app_key() -> str: """ This function returns function app key """ return os.environ.get("FunctionApp__Key") + def get_function_app_compass_function() -> str: """ This function returns function app compass function name @@ -119,10 +121,13 @@ def get_search_key(client) -> str: """ This function returns azure ai search service admin key """ - search_service_key_secret_name = str(os.environ.get("AIService__AzureSearchOptions__name")) + "-PrimaryKey" + search_service_key_secret_name = ( + str(os.environ.get("AIService__AzureSearchOptions__name")) + "-PrimaryKey" + ) retrieved_secret = client.get_secret(search_service_key_secret_name) return retrieved_secret.value + def get_search_key_secret() -> str: """ This function returns azure ai search service admin key @@ -143,12 +148,14 @@ def get_search_embedding_model_dimensions(indexer_type: IndexerType) -> str: f"AIService__AzureSearchOptions__{normalised_indexer_type}__EmbeddingDimensions" ) + def get_blob_connection_string() -> str: """ This function returns azure blob storage connection string """ return os.environ.get("StorageAccount__ConnectionString") + def get_fq_blob_connection_string() -> str: """ This function returns azure blob storage connection string diff --git a/ai_search_with_adi/ai_search/inquiry_document.py b/ai_search_with_adi/ai_search/rag_documents.py similarity index 63% rename from ai_search_with_adi/ai_search/inquiry_document.py rename to ai_search_with_adi/ai_search/rag_documents.py index 36a55a4..8adfe16 100644 --- a/ai_search_with_adi/ai_search/inquiry_document.py +++ b/ai_search_with_adi/ai_search/rag_documents.py @@ -29,8 +29,8 @@ ) -class InquiryDocumentAISearch(AISearch): - """This class is used to deploy the inquiry document index.""" +class RagDocumentsAISearch(AISearch): + """This class is used to deploy the rag document index.""" def __init__( self, @@ -42,40 +42,12 @@ def __init__( ): super().__init__(endpoint, credential, suffix, 
rebuild) - self.indexer_type = IndexerType.INQUIRY_DOCUMENT + self.indexer_type = IndexerType.RAG_DOCUMENTS if enable_page_by_chunking is not None: self.enable_page_by_chunking = enable_page_by_chunking else: self.enable_page_by_chunking = False - @property - def index_name(self): - """Get the index name for the indexer. Overwritten as this class is subclassed by InquiryDocumentXLSX and they should both point to the same index""" - return f"{str(IndexerType.INQUIRY_DOCUMENT.value)}-index{self.suffix}" - - @property - def vector_search_profile_name(self): - """Get the vector search profile name for the indexer. Overwritten as this class is subclassed by InquiryDocumentXLSX and they should both point to the same index""" - return f"{str(IndexerType.INQUIRY_DOCUMENT.value)}-compass-vector-search-profile{self.suffix}" - - @property - def vectorizer_name(self): - """Get the vectorizer name. Overwritten as this class is subclassed by InquiryDocumentXLSX and they should both point to the same index""" - return ( - f"{str(IndexerType.INQUIRY_DOCUMENT.value)}-compass-vectorizer{self.suffix}" - ) - - @property - def algorithm_name(self): - """Gtt the algorithm name. Overwritten as this class is subclassed by InquiryDocumentXLSX and they should both point to the same index""" - - return f"{str(IndexerType.INQUIRY_DOCUMENT.value)}-hnsw-algorithm{self.suffix}" - - @property - def semantic_config_name(self): - """Get the semantic config name for the indexer. Overwritten as this class is subclassed by InquiryDocumentXLSX and they should both point to the same index""" - return f"{str(IndexerType.INQUIRY_DOCUMENT.value)}-semantic-config{self.suffix}" - def get_index_fields(self) -> list[SearchableField]: """This function returns the index fields for inquiry document. 
@@ -85,42 +57,28 @@ def get_index_fields(self) -> list[SearchableField]: fields = [ SimpleField(name="Id", type=SearchFieldDataType.String, filterable=True), SearchableField( - name="Field1", type=SearchFieldDataType.String, filterable=True - ), - SearchableField( - name="Field2", - type=SearchFieldDataType.String, - sortable=True, - filterable=True, - facetable=True, - ), - SearchableField( - name="Field3", - type=SearchFieldDataType.String, - sortable=True, - filterable=True, - facetable=True, + name="Title", type=SearchFieldDataType.String, filterable=True ), SearchableField( - name="Field4", + name="ChunkId", type=SearchFieldDataType.String, key=True, - analyzer_name="a1", + analyzer_name="keyword", ), SearchableField( - name="Field5", + name="Chunk", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False, ), SearchableField( - name="Field6", + name="Sections", type=SearchFieldDataType.String, collection=True, ), SearchField( - name="EmbeddingField", + name="ChunkEmbedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=get_search_embedding_model_dimensions( self.indexer_type @@ -128,17 +86,10 @@ def get_index_fields(self) -> list[SearchableField]: vector_search_profile_name=self.vector_search_profile_name, ), SearchableField( - name="Field7", type=SearchFieldDataType.String, collection=True + name="Keywords", type=SearchFieldDataType.String, collection=True ), SearchableField( - name="Field8", - type=SearchFieldDataType.String, - sortable=True, - filterable=True, - facetable=True, - ), - SearchableField( - name="Field9", + name="SourceUri", type=SearchFieldDataType.String, sortable=True, filterable=True, @@ -150,7 +101,7 @@ def get_index_fields(self) -> list[SearchableField]: fields.extend( [ SearchableField( - name="Field10", + name="PageNumber", type=SearchFieldDataType.Int64, sortable=True, filterable=True, @@ -170,11 +121,11 @@ def get_semantic_search(self) -> SemanticSearch: 
semantic_config = SemanticConfiguration( name=self.semantic_config_name, prioritized_fields=SemanticPrioritizedFields( - title_field=SemanticField(field_name="Field1"), - content_fields=[SemanticField(field_name="Field2")], + title_field=SemanticField(field_name="Title"), + content_fields=[SemanticField(field_name="Chunk")], keywords_fields=[ - SemanticField(field_name="Field3"), - SemanticField(field_name="Field4"), + SemanticField(field_name="Keywords"), + SemanticField(field_name="Sections"), ], ), ) @@ -183,27 +134,27 @@ def get_semantic_search(self) -> SemanticSearch: return semantic_search - def get_skills(self): - """This function returns the skills for inquiry document""" + def get_skills(self) -> list: + """Get the skillset for the indexer. - adi_skill = self.get_adi_skill(self.enable_page_by_chunking) + Returns: + list: The skillsets used in the indexer""" + adi_skill = self.get_adi_skill(self.enable_page_by_chunking) text_split_skill = self.get_text_split_skill( "/document", "/document/extracted_content/content" ) - pre_embedding_cleaner_skill = self.get_pre_embedding_cleaner_skill( "/document/pages/*", "/document/pages/*", self.enable_page_by_chunking ) - key_phrase_extraction_skill = self.get_key_phrase_extraction_skill( "/document/pages/*", "/document/pages/*/cleaned_chunk" ) - embedding_skill = self.get_compass_vector_custom_skill( + embedding_skill = self.get_vector_skill( "/document/pages/*", "/document/pages/*/cleaned_chunk" ) @@ -233,19 +184,13 @@ def get_index_projections(self) -> SearchIndexerIndexProjections: name="ChunkEmbedding", source="/document/pages/*/vector", ), - InputFieldMappingEntry(name="Field1", source="/document/Field1"), - InputFieldMappingEntry(name="Field2", source="/document/Field2"), - InputFieldMappingEntry(name="Field3", source="/document/Field3"), - InputFieldMappingEntry(name="Field4", source="/document/Field4"), - InputFieldMappingEntry( - name="Field5", source="/document/pages/*/Field5" - ), + 
InputFieldMappingEntry(name="Title", source="/document/Title"), + InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"), InputFieldMappingEntry( - name="Field6", - source="/document/Field6", + name="Keywords", source="/document/pages/*/keywords" ), InputFieldMappingEntry( - name="Field7", source="/document/pages/*/Field7" + name="Sections", source="/document/pages/*/sections" ), ] @@ -253,7 +198,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjections: mappings.extend( [ InputFieldMappingEntry( - name="Field8", source="/document/pages/*/Field8" + name="PageNumber", source="/document/pages/*/page_number" ) ] ) @@ -295,7 +240,7 @@ def get_indexer(self) -> SearchIndexer: fail_on_unprocessable_document=False, fail_on_unsupported_content_type=False, index_storage_metadata_only_for_oversized_documents=True, - indexed_file_name_extensions=".pdf,.pptx,.docx", + indexed_file_name_extensions=".pdf,.pptx,.docx,.xlsx,.txt", ), max_failed_items=5, ) @@ -311,16 +256,9 @@ def get_indexer(self) -> SearchIndexer: FieldMapping( source_field_name="metadata_storage_name", target_field_name="Title" ), - FieldMapping(source_field_name="Field1", target_field_name="Field1"), - FieldMapping( - source_field_name="Field2", target_field_name="Field2" - ), - FieldMapping( - source_field_name="Field3", target_field_name="Field3" - ), FieldMapping( - source_field_name="Field4", - target_field_name="Field4", + source_field_name="metadata_storage_path", + target_field_name="SourceUri", ), ], parameters=indexer_parameters, diff --git a/ai_search_with_adi/function_apps/indexer/adi_2_ai_search.py b/ai_search_with_adi/function_apps/indexer/adi_2_ai_search.py index a477a85..ae0474a 100644 --- a/ai_search_with_adi/function_apps/indexer/adi_2_ai_search.py +++ b/ai_search_with_adi/function_apps/indexer/adi_2_ai_search.py @@ -11,13 +11,13 @@ import fitz from PIL import Image import io -import aiohttp import logging from common.storage_account import StorageAccountHelper import 
concurrent.futures import json from openai import AzureOpenAI + def crop_image_from_pdf_page(pdf_path, page_number, bounding_box): """ Crops a region from a given page in a PDF and returns it as an image. @@ -41,7 +41,9 @@ def crop_image_from_pdf_page(pdf_path, page_number, bounding_box): return img -def clean_adi_markdown(markdown_text: str, page_no:int,remove_irrelevant_figures=False): +def clean_adi_markdown( + markdown_text: str, page_no: int, remove_irrelevant_figures=False +): """Clean Markdown text extracted by the Azure Document Intelligence service. Args: @@ -73,11 +75,10 @@ def clean_adi_markdown(markdown_text: str, page_no:int,remove_irrelevant_figures comment_patterns = r"||" cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL) - combined_pattern = r'(.*?)\n===|\n## ?(.*?)\n|\n### ?(.*?)\n' + combined_pattern = r"(.*?)\n===|\n## ?(.*?)\n|\n### ?(.*?)\n" doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL) doc_metadata = [match for group in doc_metadata for match in group if match] - if remove_irrelevant_figures: # Remove irrelevant figures irrelevant_figure_pattern = ( @@ -89,12 +90,12 @@ def clean_adi_markdown(markdown_text: str, page_no:int,remove_irrelevant_figures # Replace ':selected:' with a new line cleaned_text = re.sub(r":(selected|unselected):", "\n", cleaned_text) - output_dict['content'] = cleaned_text - output_dict['section'] = doc_metadata + output_dict["content"] = cleaned_text + output_dict["sections"] = doc_metadata # add page number when chunk by page is enabled - if page_no> -1: - output_dict['page_number'] = page_no + if page_no > -1: + output_dict["page_number"] = page_no return output_dict @@ -152,60 +153,59 @@ async def understand_image_with_gptv(image_base64, caption): deployment_name = os.environ["AzureAI__GPT4V_Deployment"] api_base = os.environ["AzureAI__GPT4V_APIbase"] - client = AzureOpenAI( - api_key=api_key, + api_key=api_key, api_version=api_version, - 
base_url=f"{api_base}/openai/deployments/{deployment_name}" + base_url=f"{api_base}/openai/deployments/{deployment_name}", ) # We send both image caption and the image body to GPTv for better understanding if caption != "": response = client.chat.completions.create( - model=deployment_name, - messages=[ - { "role": "system", "content": "You are a helpful assistant." }, - { "role": "user", "content": [ - { - "type": "text", - "text": f"Describe this image (note: it has image caption: {caption}):" + model=deployment_name, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Describe this image (note: it has image caption: {caption}):", }, - { + { "type": "image_base64", - "image_base64": { - "image": image_base64 - } - } - ] } - ], - max_tokens=MAX_TOKENS - ) + "image_base64": {"image": image_base64}, + }, + ], + }, + ], + max_tokens=MAX_TOKENS, + ) else: response = client.chat.completions.create( model=deployment_name, messages=[ - { "role": "system", "content": "You are a helpful assistant." }, - { "role": "user", "content": [ - { - "type": "text", - "text": "Describe this image:" - }, - { - "type": "image_base64", - "image_base64": { - "image": image_base64 - } - } - ] } + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this image:"}, + { + "type": "image_base64", + "image_base64": {"image": image_base64}, + }, + ], + }, ], - max_tokens=MAX_TOKENS + max_tokens=MAX_TOKENS, ) img_description = response.choices[0].message.content - + return img_description + def pil_image_to_base64(image, image_format="JPEG"): """ Converts a PIL image to a base64-encoded string. 
@@ -293,10 +293,10 @@ def create_page_wise_content(result: AnalyzeResult) -> list: page.spans[0]["offset"] : page.spans[0]["offset"] + page.spans[0]["length"] ] page_wise_content.append(page_content) - page_number+=1 + page_number += 1 page_numbers.append(page_number) - return page_wise_content,page_numbers + return page_wise_content, page_numbers async def analyse_document(file_path: str) -> AnalyzeResult: @@ -431,7 +431,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> try: if chunk_by_page: cleaned_result = [] - markdown_content,page_no = create_page_wise_content(result) + markdown_content, page_no = create_page_wise_content(result) tasks = [ process_figures_from_extracted_content( temp_file_path, page_content, result.figures, page_number=idx @@ -455,7 +455,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> temp_file_path, markdown_content, result.figures ) cleaned_result = clean_adi_markdown( - content_with_figures, page_no=-1,remove_irrelevant_figures=False + content_with_figures, page_no=-1, remove_irrelevant_figures=False ) except Exception as e: logging.error(e) From 42adc2ab7ee2e32e5898231b26d701c0aa0bc2b5 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 13:56:16 +0100 Subject: [PATCH 07/33] Update the deployment script --- ai_search_with_adi/ai_search/ai_search.py | 88 +++++++++++-------- ai_search_with_adi/ai_search/deploy.py | 43 +++------ ai_search_with_adi/ai_search/rag_documents.py | 28 +++--- 3 files changed, 78 insertions(+), 81 deletions(-) diff --git a/ai_search_with_adi/ai_search/ai_search.py b/ai_search_with_adi/ai_search/ai_search.py index d2c9ba5..5527a31 100644 --- a/ai_search_with_adi/ai_search/ai_search.py +++ b/ai_search_with_adi/ai_search/ai_search.py @@ -20,7 +20,6 @@ SearchIndexerDataContainer, SearchIndexerDataSourceConnection, SearchIndexerDataSourceType, - SearchIndexerDataUserAssignedIdentity, OutputFieldMappingEntry, InputFieldMappingEntry, 
SynonymMap, @@ -29,29 +28,21 @@ ) from azure.core.exceptions import HttpResponseError from azure.search.documents.indexes import SearchIndexerClient, SearchIndexClient -from ai_search_with_adi.ai_search.environment import ( - get_fq_blob_connection_string, - get_blob_container_name, - get_custom_skill_function_url, - get_managed_identity_fqname, - get_function_app_authresourceid, -) +from ai_search_with_adi.ai_search.environment import AISearchEnvironment, IdentityType class AISearch(ABC): + """Handles the deployment of the AI search pipeline.""" + def __init__( self, - endpoint: str, - credential, suffix: str | None = None, rebuild: bool | None = False, ): """Initialize the AI search class Args: - endpoint (str): The search endpoint - credential (AzureKeyCredential): The search credential - suffix (str, optional): The suffix for the indexer. Defaults to None. + suffix (str, optional): The suffix for the indexer. Defaults to None. If an suffix is provided, it is assumed to be a test indexer. rebuild (bool, optional): Whether to rebuild the index. Defaults to False. """ self.indexer_type = None @@ -61,6 +52,7 @@ def __init__( else: self.rebuild = False + # If suffix is None, then it is not a test indexer. Test indexer limits the rate of indexing and turns off the schedule. 
Useful for testing index changes if suffix is None: self.suffix = "" self.test = False @@ -68,8 +60,14 @@ def __init__( self.suffix = f"-{suffix}-test" self.test = True - self._search_indexer_client = SearchIndexerClient(endpoint, credential) - self._search_index_client = SearchIndexClient(endpoint, credential) + self.environment = AISearchEnvironment(indexer_type=self.indexer_type) + + self._search_indexer_client = SearchIndexerClient( + self.environment.ai_search_endpoint, self.environment.ai_search_credential + ) + self._search_index_client = SearchIndexClient( + self.environment.ai_search_endpoint, self.environment.ai_search_credential + ) @property def indexer_name(self): @@ -94,7 +92,7 @@ def index_name(self): @property def data_source_name(self): """Get the data source name for the indexer.""" - blob_container_name = get_blob_container_name(self.indexer_type) + blob_container_name = self.environment.get_blob_container_name() return f"{blob_container_name}-data-source{self.suffix}" @property @@ -146,16 +144,6 @@ def get_synonym_map_names(self) -> list[str]: """Get the synonym map names for the indexer.""" return [] - def get_user_assigned_managed_identity( - self, - ) -> SearchIndexerDataUserAssignedIdentity: - """Get user assigned managed identity details""" - - user_assigned_identity = SearchIndexerDataUserAssignedIdentity( - user_assigned_identity=get_managed_identity_fqname() - ) - return user_assigned_identity - def get_data_source(self) -> SearchIndexerDataSourceConnection: """Get the data source for the indexer.""" @@ -166,19 +154,21 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection: ) container = SearchIndexerDataContainer( - name=get_blob_container_name(self.indexer_type) + name=self.environment.get_blob_container_name() ) data_source_connection = SearchIndexerDataSourceConnection( name=self.data_source_name, type=SearchIndexerDataSourceType.AZURE_BLOB, - connection_string=get_fq_blob_connection_string(), + 
connection_string=self.environment.blob_connection_string, container=container, data_change_detection_policy=data_change_detection_policy, data_deletion_detection_policy=data_deletion_detection_policy, - identity=self.get_user_assigned_managed_identity(), ) + if self.environment.identity_type != IdentityType.KEY: + data_source_connection.identity = self.environment.ai_search_identity_id + return data_source_connection def get_pre_embedding_cleaner_skill( @@ -226,17 +216,25 @@ def get_pre_embedding_cleaner_skill( name="Pre Embedding Cleaner Skill", description="Skill to clean the data before sending to embedding", context=context, - uri=get_custom_skill_function_url("pre_embedding_cleaner"), + uri=self.environment.get_custom_skill_function_url("pre_embedding_cleaner"), timeout="PT230S", batch_size=batch_size, degree_of_parallelism=degree_of_parallelism, http_method="POST", inputs=pre_embedding_cleaner_skill_inputs, outputs=pre_embedding_cleaner_skill_outputs, - auth_resource_id=get_function_app_authresourceid(), - auth_identity=self.get_user_assigned_managed_identity(), ) + if self.environment.identity_type != IdentityType.KEY: + pre_embedding_cleaner_skill.auth_identity = ( + self.environment.ai_search_identity_id + ) + + if self.environment.identity_type == IdentityType.USER_ASSIGNED: + pre_embedding_cleaner_skill.auth_resource_id = ( + self.environment.ai_search_user_assigned_identity + ) + return pre_embedding_cleaner_skill def get_text_split_skill(self, context, source) -> SplitSkill: @@ -294,7 +292,7 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill: name="ADI Skill", description="Skill to generate ADI", context="/document", - uri=get_custom_skill_function_url("adi"), + uri=self.environment.get_custom_skill_function_url("adi"), timeout="PT230S", batch_size=batch_size, degree_of_parallelism=degree_of_parallelism, @@ -306,10 +304,16 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill: ) ], outputs=output, - 
auth_resource_id=get_function_app_authresourceid(), - auth_identity=self.get_user_assigned_managed_identity(), ) + if self.environment.identity_type != IdentityType.KEY: + adi_skill.auth_identity = self.environment.ai_search_identity_id + + if self.environment.identity_type == IdentityType.USER_ASSIGNED: + adi_skill.auth_resource_id = ( + self.environment.ai_search_user_assigned_identity + ) + return adi_skill def get_vector_skill( @@ -368,17 +372,25 @@ def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill: name="Key phrase extraction API", description="Skill to extract keyphrases", context=context, - uri=get_custom_skill_function_url("keyphraseextraction"), + uri=self.environment.get_custom_skill_function_url("key_phrase_extraction"), timeout="PT230S", batch_size=batch_size, degree_of_parallelism=degree_of_parallelism, http_method="POST", inputs=keyphrase_extraction_skill_inputs, outputs=keyphrase_extraction__skill_outputs, - auth_resource_id=get_function_app_authresourceid(), - auth_identity=self.get_user_assigned_managed_identity(), ) + if self.environment.identity_type != IdentityType.KEY: + key_phrase_extraction_skill.auth_identity = ( + self.environment.ai_search_identity_id + ) + + if self.environment.identity_type == IdentityType.USER_ASSIGNED: + key_phrase_extraction_skill.auth_resource_id = ( + self.environment.ai_search_user_assigned_identity + ) + return key_phrase_extraction_skill def get_vector_search(self) -> VectorSearch: diff --git a/ai_search_with_adi/ai_search/deploy.py b/ai_search_with_adi/ai_search/deploy.py index 5e1ffb2..e28a61c 100644 --- a/ai_search_with_adi/ai_search/deploy.py +++ b/ai_search_with_adi/ai_search/deploy.py @@ -1,49 +1,26 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
- import argparse -from ai_search_with_adi.ai_search.environment import ( - get_search_endpoint, - get_managed_identity_id, - get_search_key, - get_key_vault_url, -) -from azure.core.credentials import AzureKeyCredential -from azure.identity import DefaultAzureCredential -from azure.keyvault.secrets import SecretClient from ai_search_with_adi.ai_search.rag_documents import RagDocumentsAISearch -def main(args): - endpoint = get_search_endpoint() - - try: - credential = DefaultAzureCredential( - managed_identity_client_id=get_managed_identity_id() - ) - # initializing key vault client - client = SecretClient(vault_url=get_key_vault_url(), credential=credential) - print("Using managed identity credential") - except Exception as e: - print(e) - credential = AzureKeyCredential(get_search_key(client=client)) - print("Using Azure Key credential") +def deploy_config(arguments: argparse.Namespace): + """Deploy the indexer configuration based on the arguments passed. - if args.indexer_type == "rag": - # Deploy the inquiry index + Args: + arguments (argparse.Namespace): The arguments passed to the script""" + if arguments.indexer_type == "rag": index_config = RagDocumentsAISearch( - endpoint=endpoint, - credential=credential, - suffix=args.suffix, - rebuild=args.rebuild, - enable_page_by_chunking=args.enable_page_chunking, + suffix=arguments.suffix, + rebuild=arguments.rebuild, + enable_page_by_chunking=arguments.enable_page_chunking, ) else: raise ValueError("Invalid Indexer Type") index_config.deploy() - if args.rebuild: + if arguments.rebuild: index_config.reset_indexer() @@ -75,4 +52,4 @@ def main(args): ) args = parser.parse_args() - main(args) + deploy_config(args) diff --git a/ai_search_with_adi/ai_search/rag_documents.py b/ai_search_with_adi/ai_search/rag_documents.py index 8adfe16..8541478 100644 --- a/ai_search_with_adi/ai_search/rag_documents.py +++ b/ai_search_with_adi/ai_search/rag_documents.py @@ -24,7 +24,6 @@ ) from ai_search import AISearch from 
ai_search_with_adi.ai_search.environment import ( - get_search_embedding_model_dimensions, IndexerType, ) @@ -34,13 +33,17 @@ class RagDocumentsAISearch(AISearch): def __init__( self, - endpoint, - credential, - suffix=None, - rebuild=False, + suffix: str | None = None, + rebuild: bool | None = False, enable_page_by_chunking=False, ): - super().__init__(endpoint, credential, suffix, rebuild) + """Initialize the RagDocumentsAISearch class. This class implements the deployment of the rag document index. + + Args: + suffix (str, optional): The suffix for the indexer. Defaults to None. If an suffix is provided, it is assumed to be a test indexer. + rebuild (bool, optional): Whether to rebuild the index. Defaults to False. + """ + super().__init__(suffix, rebuild) self.indexer_type = IndexerType.RAG_DOCUMENTS if enable_page_by_chunking is not None: @@ -80,9 +83,7 @@ def get_index_fields(self) -> list[SearchableField]: SearchField( name="ChunkEmbedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), - vector_search_dimensions=get_search_embedding_model_dimensions( - self.indexer_type - ), + vector_search_dimensions=self.environment.embedding_model_dimensions, vector_search_profile_name=self.vector_search_profile_name, ), SearchableField( @@ -224,6 +225,8 @@ def get_indexer(self) -> SearchIndexer: Returns: SearchIndexer: The indexer for inquiry document""" + + # Only place on schedule if it is not a test deployment if self.test: schedule = None batch_size = 4 @@ -231,12 +234,17 @@ def get_indexer(self) -> SearchIndexer: schedule = {"interval": "PT15M"} batch_size = 16 + if self.environment.use_private_endpoint: + execution_environment = IndexerExecutionEnvironment.PRIVATE + else: + execution_environment = IndexerExecutionEnvironment.STANDARD + indexer_parameters = IndexingParameters( batch_size=batch_size, configuration=IndexingParametersConfiguration( data_to_extract=BlobIndexerDataToExtract.ALL_METADATA, query_timeout=None, - 
execution_environment=IndexerExecutionEnvironment.PRIVATE, + execution_environment=execution_environment, fail_on_unprocessable_document=False, fail_on_unsupported_content_type=False, index_storage_metadata_only_for_oversized_documents=True, From 3238e64f6283d24e6252f978f404c8dea50b610c Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 14:11:27 +0100 Subject: [PATCH 08/33] Temp update of code --- ai_search_with_adi/ai_search/ai_search.py | 4 +- ai_search_with_adi/ai_search/environment.py | 60 +++++++++++++++++---- 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/ai_search_with_adi/ai_search/ai_search.py b/ai_search_with_adi/ai_search/ai_search.py index 5527a31..baadb62 100644 --- a/ai_search_with_adi/ai_search/ai_search.py +++ b/ai_search_with_adi/ai_search/ai_search.py @@ -154,13 +154,13 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection: ) container = SearchIndexerDataContainer( - name=self.environment.get_blob_container_name() + name=self.environment.storage_account_blob_container_name ) data_source_connection = SearchIndexerDataSourceConnection( name=self.data_source_name, type=SearchIndexerDataSourceType.AZURE_BLOB, - connection_string=self.environment.blob_connection_string, + connection_string=self.environment.storage_account_connection_string, container=container, data_change_detection_policy=data_change_detection_policy, data_deletion_detection_policy=data_deletion_detection_policy, diff --git a/ai_search_with_adi/ai_search/environment.py b/ai_search_with_adi/ai_search/environment.py index b806fe6..d04c660 100644 --- a/ai_search_with_adi/ai_search/environment.py +++ b/ai_search_with_adi/ai_search/environment.py @@ -6,23 +6,65 @@ from dotenv import find_dotenv, load_dotenv from enum import Enum -load_dotenv(find_dotenv()) - class IndexerType(Enum): """The type of the indexer""" RAG_DOCUMENTS = "rag-documents" +class IdentityType(Enum): + """The type of the indexer""" -# key vault -def get_key_vault_url() -> 
str: - """ - This function returns key vault url - """ - return os.environ.get("KeyVault__Url") - + USER_ASSIGNED = "user_assigned" + SYSTEM_ASSIGNED = "system_assigned" + KEY = "key" + +class AISearchEnvironment: + def __init__(self, indexer_type: IndexerType): + """Initialize the AISearchEnvironment class. + + Args: + indexer_type (IndexerType): The type of the indexer + """ + load_dotenv(find_dotenv()) + self.indexer_type = indexer_type + + @property + def normalised_indexer_type(self): + """This function returns the normalised indexer type""" + normalised_indexer_type = ( + self.indexer_type.value.replace("-", " ").title().replace(" ", "") + ) + return normalised_indexer_type + + @property + def identity_type(self): + """This function returns the identity type""" + type = os.environ.get("AIService__AzureSearchOptions__IdentityType") + + if type == "user_assigned": + return IdentityType.USER_ASSIGNED + elif type == "system_assigned": + return IdentityType.SYSTEM_ASSIGNED + elif type == "key": + return IdentityType.KEY + + @property + def storage_account_connection_string(self) -> str: + """This function returns the blob connection string. 
If the identity type is user_assigned or system_assigned, it returns the FQEndpoint, otherwise it returns the ConnectionString""" + if self.identity_type in [IdentityType.SYSTEM_ASSIGNED, IdentityType.USER_ASSIGNED]: + return os.environ.get("StorageAccount__FQEndpoint") + else: + return os.environ.get("StorageAccount__ConnectionString") + + @property + def storage_account_blob_container_name(self) -> str: + """ + This function returns azure blob container name + """ + + return os.environ.get(f"StorageAccount__{self.normalised_indexer_type}__Container") # managed identity id def get_managed_identity_id() -> str: """ From 666203eb80bf5cc0789fd049dc4942e596025cb0 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 14:36:18 +0100 Subject: [PATCH 09/33] Temp update of code --- ai_search_with_adi/ai_search/ai_search.py | 6 +- ai_search_with_adi/ai_search/environment.py | 210 +++++++++----------- 2 files changed, 102 insertions(+), 114 deletions(-) diff --git a/ai_search_with_adi/ai_search/ai_search.py b/ai_search_with_adi/ai_search/ai_search.py index baadb62..1cbca3b 100644 --- a/ai_search_with_adi/ai_search/ai_search.py +++ b/ai_search_with_adi/ai_search/ai_search.py @@ -92,7 +92,7 @@ def index_name(self): @property def data_source_name(self): """Get the data source name for the indexer.""" - blob_container_name = self.environment.get_blob_container_name() + blob_container_name = self.environment.storage_account_blob_container_name return f"{blob_container_name}-data-source{self.suffix}" @property @@ -166,8 +166,8 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection: data_deletion_detection_policy=data_deletion_detection_policy, ) - if self.environment.identity_type != IdentityType.KEY: - data_source_connection.identity = self.environment.ai_search_identity_id + # if self.environment.identity_type != IdentityType.KEY: + # data_source_connection.identity = self.environment.ai_search_identity_id return data_source_connection diff --git 
a/ai_search_with_adi/ai_search/environment.py b/ai_search_with_adi/ai_search/environment.py index d04c660..a4188ed 100644 --- a/ai_search_with_adi/ai_search/environment.py +++ b/ai_search_with_adi/ai_search/environment.py @@ -1,11 +1,10 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. - -"""Module providing environment definition""" import os from dotenv import find_dotenv, load_dotenv from enum import Enum - +from azure.identity import DefaultAzureCredential +from azure.core.credentials import AzureKeyCredential class IndexerType(Enum): """The type of the indexer""" @@ -20,6 +19,7 @@ class IdentityType(Enum): KEY = "key" class AISearchEnvironment: + """This class is used to get the environment variables for the AI search service.""" def __init__(self, indexer_type: IndexerType): """Initialize the AISearchEnvironment class. @@ -30,8 +30,12 @@ def __init__(self, indexer_type: IndexerType): self.indexer_type = indexer_type @property - def normalised_indexer_type(self): - """This function returns the normalised indexer type""" + def normalised_indexer_type(self) -> str: + """This function returns the normalised indexer type. + + Returns: + str: The normalised indexer type + """ normalised_indexer_type = ( self.indexer_type.value.replace("-", " ").title().replace(" ", "") ) @@ -39,16 +43,45 @@ def normalised_indexer_type(self): return normalised_indexer_type @property - def identity_type(self): - """This function returns the identity type""" - type = os.environ.get("AIService__AzureSearchOptions__IdentityType") + def identity_type(self) -> IdentityType: + """This function returns the identity type. 
+ + Returns: + IdentityType: The identity type + """ + identity = os.environ.get("AIService__AzureSearchOptions__IdentityType") - if type == "user_assigned": + if identity == "user_assigned": return IdentityType.USER_ASSIGNED - elif type == "system_assigned": + elif identity == "system_assigned": return IdentityType.SYSTEM_ASSIGNED - elif type == "key": + elif identity == "key": return IdentityType.KEY + else: + raise ValueError("Invalid identity type") + + @property + def ai_search_endpoint(self) -> str: + """This function returns the ai search endpoint. + + Returns: + str: The ai search endpoint + """ + return os.environ.get("AIService__AzureSearchOptions__Endpoint") + + @property + def ai_search_credential(self) -> DefaultAzureCredential | AzureKeyCredential: + """This function returns the ai search credential. + + Returns: + DefaultAzureCredential | AzureKeyCredential: The ai search credential + """ + if self.identity_type in IdentityType.SYSTEM_ASSIGNED: + return DefaultAzureCredential() + elif self.identity_type in IdentityType.USER_ASSIGNED: + return DefaultAzureCredential(managed_identity_client_id =os.environ.get("AIService__AzureSearchOptions__ManagedIdentity__FQName")) + else: + return AzureKeyCredential(os.environ.get("AIService__AzureSearchOptions__Key__Secret")) @property def storage_account_connection_string(self) -> str: @@ -65,6 +98,61 @@ def storage_account_blob_container_name(self) -> str: """ return os.environ.get(f"StorageAccount__{self.normalised_indexer_type}__Container") + + @property + def function_app_end_point(self) -> str: + """ + This function returns function app endpoint + """ + return os.environ.get("FunctionApp__Endpoint") + + @property + def function_app_key(self) -> str: + """ + This function returns function app key + """ + return os.environ.get("FunctionApp__Key") + + @property + def function_app_pre_embedding_cleaner_route(self) -> str: + """ + This function returns function app data cleanup function name + """ + return 
os.environ.get("FunctionApp__PreEmbeddingCleaner__FunctionName") + + @property + def function_app_adi_route(self) -> str: + """ + This function returns function app adi name + """ + return os.environ.get("FunctionApp__DocumentIntelligence__FunctionName") + + @property + def function_app_key_phrase_extractor_route(self) -> str: + """ + This function returns function app keyphrase extractor name + """ + return os.environ.get("FunctionApp__KeyphraseExtractor__FunctionName") + + def get_custom_skill_function_url(self, skill_type: str): + """ + Get the function app url that is hosting the custom skill + """ + if skill_type == "pre_embedding_cleaner": + route = self.function_app_pre_embedding_cleaner_route + elif skill_type == "adi": + route = self.function_app_adi_route + elif skill_type == "key_phrase_extraction": + route = self.function_app_key_phrase_extractor_route + else: + raise ValueError(f"Invalid skill type: {skill_type}") + + full_url = f"{self.function_app_end_point}/api/{route}?code={self.function_app_key}" + + return full_url + + + # managed identity id def get_managed_identity_id() -> str: """ @@ -87,63 +175,6 @@ def get_function_app_authresourceid() -> str: """ return os.environ.get("FunctionApp__AuthResourceId") - -def get_function_app_end_point() -> str: - """ - This function returns function app endpoint - """ - return os.environ.get("FunctionApp__Endpoint") - - -def get_function_app_key() -> str: - """ - This function returns function app key - """ - return os.environ.get("FunctionApp__Key") - - -def get_function_app_compass_function() -> str: - """ - This function returns function app compass function name - """ - return os.environ.get("FunctionApp__Compass__FunctionName") - - -def get_function_app_pre_embedding_cleaner_function() -> str: - """ - This function returns function app data cleanup function name - """ - return os.environ.get("FunctionApp__PreEmbeddingCleaner__FunctionName") - - -def get_function_app_adi_function() -> str: - """ - This 
function returns function app adi name - """ - return os.environ.get("FunctionApp__DocumentIntelligence__FunctionName") - - -def get_function_app_custom_split_function() -> str: - """ - This function returns function app adi name - """ - return os.environ.get("FunctionApp__CustomTextSplit__FunctionName") - - -def get_function_app_keyphrase_extractor_function() -> str: - """ - This function returns function app keyphrase extractor name - """ - return os.environ.get("FunctionApp__KeyphraseExtractor__FunctionName") - - -def get_function_app_ocr_function() -> str: - """ - This function returns function app ocr name - """ - return os.environ.get("FunctionApp__Ocr__FunctionName") - - # search def get_search_endpoint() -> str: """ @@ -191,20 +222,6 @@ def get_search_embedding_model_dimensions(indexer_type: IndexerType) -> str: ) -def get_blob_connection_string() -> str: - """ - This function returns azure blob storage connection string - """ - return os.environ.get("StorageAccount__ConnectionString") - - -def get_fq_blob_connection_string() -> str: - """ - This function returns azure blob storage connection string - """ - return os.environ.get("StorageAccount__FQEndpoint") - - def get_blob_container_name(indexer_type: str) -> str: """ This function returns azure blob container name @@ -213,32 +230,3 @@ def get_blob_container_name(indexer_type: str) -> str: indexer_type.value.replace("-", " ").title().replace(" ", "") ) return os.environ.get(f"StorageAccount__{normalised_indexer_type}__Container") - - -def get_custom_skill_function_url(skill_type: str): - """ - Get the function app url that is hosting the custom skill - """ - url = ( - get_function_app_end_point() - + "/api/function_name?code=" - + get_function_app_key() - ) - if skill_type == "compass": - url = url.replace("function_name", get_function_app_compass_function()) - elif skill_type == "pre_embedding_cleaner": - url = url.replace( - "function_name", get_function_app_pre_embedding_cleaner_function() - ) - elif 
skill_type == "adi": - url = url.replace("function_name", get_function_app_adi_function()) - elif skill_type == "split": - url = url.replace("function_name", get_function_app_custom_split_function()) - elif skill_type == "keyphraseextraction": - url = url.replace( - "function_name", get_function_app_keyphrase_extractor_function() - ) - elif skill_type == "ocr": - url = url.replace("function_name", get_function_app_ocr_function()) - - return url From 5c74a89b809729c7a0e684c34222c870d7e9cdb1 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 14:51:10 +0100 Subject: [PATCH 10/33] Temp update of code --- ai_search_with_adi/ai_search/ai_search.py | 8 +- ai_search_with_adi/ai_search/environment.py | 123 +++++++----------- ai_search_with_adi/ai_search/rag_documents.py | 2 +- 3 files changed, 51 insertions(+), 82 deletions(-) diff --git a/ai_search_with_adi/ai_search/ai_search.py b/ai_search_with_adi/ai_search/ai_search.py index 1cbca3b..1738d46 100644 --- a/ai_search_with_adi/ai_search/ai_search.py +++ b/ai_search_with_adi/ai_search/ai_search.py @@ -362,10 +362,10 @@ def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill: batch_size = 16 degree_of_parallelism = 16 - keyphrase_extraction_skill_inputs = [ + key_phrase_extraction_skill_inputs = [ InputFieldMappingEntry(name="text", source=source), ] - keyphrase_extraction__skill_outputs = [ + key_phrase_extraction__skill_outputs = [ OutputFieldMappingEntry(name="keyPhrases", target_name="keywords") ] key_phrase_extraction_skill = WebApiSkill( @@ -377,8 +377,8 @@ def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill: batch_size=batch_size, degree_of_parallelism=degree_of_parallelism, http_method="POST", - inputs=keyphrase_extraction_skill_inputs, - outputs=keyphrase_extraction__skill_outputs, + inputs=key_phrase_extraction_skill_inputs, + outputs=key_phrase_extraction__skill_outputs, ) if self.environment.identity_type != IdentityType.KEY: diff --git 
a/ai_search_with_adi/ai_search/environment.py b/ai_search_with_adi/ai_search/environment.py index a4188ed..e4e407b 100644 --- a/ai_search_with_adi/ai_search/environment.py +++ b/ai_search_with_adi/ai_search/environment.py @@ -5,6 +5,7 @@ from enum import Enum from azure.identity import DefaultAzureCredential from azure.core.credentials import AzureKeyCredential +from azure.search.documents.indexes.models import SearchIndexerDataUserAssignedIdentity class IndexerType(Enum): """The type of the indexer""" @@ -68,6 +69,26 @@ def ai_search_endpoint(self) -> str: str: The ai search endpoint """ return os.environ.get("AIService__AzureSearchOptions__Endpoint") + + @property + def ai_search_identity_id(self) -> str: + """This function returns the ai search identity id. + + Returns: + str: The ai search identity id + """ + return os.environ.get("AIService__AzureSearchOptions__Identity__ClientId") + + @property + def ai_search_user_assigned_identity(self) -> SearchIndexerDataUserAssignedIdentity: + """This function returns the ai search user assigned identity. 
+ + Returns: + SearchIndexerDataUserAssignedIdentity: The ai search user assigned identity""" + user_assigned_identity = SearchIndexerDataUserAssignedIdentity( + user_assigned_identity=os.environ.get("AIService__AzureSearchOptions__Identity__FQName") + ) + return user_assigned_identity @property def ai_search_credential(self) -> DefaultAzureCredential | AzureKeyCredential: @@ -79,9 +100,9 @@ def ai_search_credential(self) -> DefaultAzureCredential | AzureKeyCredential: if self.identity_type in IdentityType.SYSTEM_ASSIGNED: return DefaultAzureCredential() elif self.identity_type in IdentityType.USER_ASSIGNED: - return DefaultAzureCredential(managed_identity_client_id =os.environ.get("AIService__AzureSearchOptions__ManagedIdentity__FQName")) + return DefaultAzureCredential(managed_identity_client_id=self.ai_search_identity_id) else: - return AzureKeyCredential(os.environ.get("AIService__AzureSearchOptions__Key__Secret")) + return AzureKeyCredential(os.environ.get("AIService__AzureSearchOptions__Key")) @property def storage_account_connection_string(self) -> str: @@ -125,15 +146,35 @@ def function_app_adi_route(self) -> str: """ This function returns function app adi name """ - return os.environ.get("FunctionApp__DocumentIntelligence__FunctionName") + return os.environ.get("FunctionApp__ADI__FunctionName") @property def function_app_key_phrase_extractor_route(self) -> str: """ This function returns function app keyphrase extractor name """ - return os.environ.get("FunctionApp__KeyphraseExtractor__FunctionName") + return os.environ.get("FunctionApp__KeyPhraseExtractor__FunctionName") + + @property + def ai_search_embedding_model_dimensions(self) -> str: + """ + This function returns dimensions for embedding model. 
+ + Returns: + str: The dimensions for embedding model + """ + + return os.environ.get( + f"AIService__AzureSearchOptions__{self.normalised_indexer_type}__EmbeddingDimensions" + ) + @property + def use_private_endpoint(self) -> bool: + """ + This function returns true if private endpoint is used + """ + return os.environ.get("AIService__AzureSearchOptions__UsePrivateEndpoint") == "true" + def get_custom_skill_function_url(self, skill_type: str): """ Get the function app url that is hosting the custom skill @@ -152,81 +193,9 @@ def get_custom_skill_function_url(self, skill_type: str): return full_url - -# managed identity id -def get_managed_identity_id() -> str: - """ - This function returns maanged identity id - """ - return os.environ.get("AIService__AzureSearchOptions__ManagedIdentity__ClientId") - - -def get_managed_identity_fqname() -> str: - """ - This function returns maanged identity name - """ - return os.environ.get("AIService__AzureSearchOptions__ManagedIdentity__FQName") - - # function app details def get_function_app_authresourceid() -> str: """ This function returns apps registration in microsoft entra id """ - return os.environ.get("FunctionApp__AuthResourceId") - -# search -def get_search_endpoint() -> str: - """ - This function returns azure ai search service endpoint - """ - return os.environ.get("AIService__AzureSearchOptions__Endpoint") - - -def get_search_user_assigned_identity() -> str: - """ - This function returns azure ai search service endpoint - """ - return os.environ.get("AIService__AzureSearchOptions__UserAssignedIdentity") - - -def get_search_key(client) -> str: - """ - This function returns azure ai search service admin key - """ - search_service_key_secret_name = ( - str(os.environ.get("AIService__AzureSearchOptions__name")) + "-PrimaryKey" - ) - retrieved_secret = client.get_secret(search_service_key_secret_name) - return retrieved_secret.value - - -def get_search_key_secret() -> str: - """ - This function returns azure ai search 
service admin key - """ - return os.environ.get("AIService__AzureSearchOptions__Key__Secret") - - -def get_search_embedding_model_dimensions(indexer_type: IndexerType) -> str: - """ - This function returns dimensions for embedding model - """ - - normalised_indexer_type = ( - indexer_type.value.replace("-", " ").title().replace(" ", "") - ) - - return os.environ.get( - f"AIService__AzureSearchOptions__{normalised_indexer_type}__EmbeddingDimensions" - ) - - -def get_blob_container_name(indexer_type: str) -> str: - """ - This function returns azure blob container name - """ - normalised_indexer_type = ( - indexer_type.value.replace("-", " ").title().replace(" ", "") - ) - return os.environ.get(f"StorageAccount__{normalised_indexer_type}__Container") + return os.environ.get("FunctionApp__AuthResourceId") \ No newline at end of file diff --git a/ai_search_with_adi/ai_search/rag_documents.py b/ai_search_with_adi/ai_search/rag_documents.py index 8541478..cf581d4 100644 --- a/ai_search_with_adi/ai_search/rag_documents.py +++ b/ai_search_with_adi/ai_search/rag_documents.py @@ -83,7 +83,7 @@ def get_index_fields(self) -> list[SearchableField]: SearchField( name="ChunkEmbedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), - vector_search_dimensions=self.environment.embedding_model_dimensions, + vector_search_dimensions=self.environment.ai_search_embedding_model_dimensions, vector_search_profile_name=self.vector_search_profile_name, ), SearchableField( From a06909d179114ab1b419ce82a73b786302910b01 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 14:59:14 +0100 Subject: [PATCH 11/33] Refactor envs --- ai_search_with_adi/ai_search/ai_search.py | 16 ++++++++-------- ai_search_with_adi/ai_search/environment.py | 17 ++++++++--------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/ai_search_with_adi/ai_search/ai_search.py b/ai_search_with_adi/ai_search/ai_search.py index 1738d46..126ec3e 100644 --- 
a/ai_search_with_adi/ai_search/ai_search.py +++ b/ai_search_with_adi/ai_search/ai_search.py @@ -166,8 +166,8 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection: data_deletion_detection_policy=data_deletion_detection_policy, ) - # if self.environment.identity_type != IdentityType.KEY: - # data_source_connection.identity = self.environment.ai_search_identity_id + if self.environment.identity_type != IdentityType.KEY: + data_source_connection.identity = self.environment.ai_search_identity_id return data_source_connection @@ -227,11 +227,11 @@ def get_pre_embedding_cleaner_skill( if self.environment.identity_type != IdentityType.KEY: pre_embedding_cleaner_skill.auth_identity = ( - self.environment.ai_search_identity_id + self.environment.function_app_app_registration_resource_id ) if self.environment.identity_type == IdentityType.USER_ASSIGNED: - pre_embedding_cleaner_skill.auth_resource_id = ( + pre_embedding_cleaner_skill.auth_identity = ( self.environment.ai_search_user_assigned_identity ) @@ -307,10 +307,10 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill: ) if self.environment.identity_type != IdentityType.KEY: - adi_skill.auth_identity = self.environment.ai_search_identity_id + adi_skill.auth_identity = self.environment.function_app_app_registration_resource_id if self.environment.identity_type == IdentityType.USER_ASSIGNED: - adi_skill.auth_resource_id = ( + adi_skill.auth_identity = ( self.environment.ai_search_user_assigned_identity ) @@ -383,11 +383,11 @@ def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill: if self.environment.identity_type != IdentityType.KEY: key_phrase_extraction_skill.auth_identity = ( - self.environment.ai_search_identity_id + self.environment.function_app_app_registration_resource_id ) if self.environment.identity_type == IdentityType.USER_ASSIGNED: - key_phrase_extraction_skill.auth_resource_id = ( + key_phrase_extraction_skill.auth_identity = ( 
self.environment.ai_search_user_assigned_identity ) diff --git a/ai_search_with_adi/ai_search/environment.py b/ai_search_with_adi/ai_search/environment.py index e4e407b..b406cac 100644 --- a/ai_search_with_adi/ai_search/environment.py +++ b/ai_search_with_adi/ai_search/environment.py @@ -134,6 +134,13 @@ def function_app_key(self) -> str: """ return os.environ.get("FunctionApp__Key") + @property + def function_app_app_registration_resource_id(self) -> str: + """ + This function returns function app app registration resource id + """ + return os.environ.get("FunctionApp__AppRegistrationResourceId") + @property def function_app_pre_embedding_cleaner_route(self) -> str: """ @@ -190,12 +197,4 @@ def get_custom_skill_function_url(self, skill_type: str): full_url = f"{self.function_app_end_point}/api/{route}?code={self.function_app_key}" - return full_url - - -# function app details -def get_function_app_authresourceid() -> str: - """ - This function returns apps registration in microsoft entra id - """ - return os.environ.get("FunctionApp__AuthResourceId") \ No newline at end of file + return full_url \ No newline at end of file From 97f32a6c028da443e60a2320e282b673058b51d1 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 15:32:40 +0100 Subject: [PATCH 12/33] Add openai setting --- ai_search_with_adi/ai_search/.env | 20 ++++ ai_search_with_adi/ai_search/ai_search.py | 35 ++++-- ai_search_with_adi/ai_search/environment.py | 111 +++++++++++++----- ai_search_with_adi/ai_search/rag_documents.py | 2 +- .../indexer/pre_embedding_cleaner.py | 44 +++---- 5 files changed, 155 insertions(+), 57 deletions(-) create mode 100644 ai_search_with_adi/ai_search/.env diff --git a/ai_search_with_adi/ai_search/.env b/ai_search_with_adi/ai_search/.env new file mode 100644 index 0000000..77c0372 --- /dev/null +++ b/ai_search_with_adi/ai_search/.env @@ -0,0 +1,20 @@ +FunctionApp__Endpoint= +FunctionApp__Key= 
+FunctionApp__PreEmbeddingCleaner__FunctionName=pre_embedding_cleaner +FunctionApp__ADI__FunctionName=adi_2_ai_search +FunctionApp__KeyPhraseExtractor__FunctionName=keyphrase_extractor +FunctionApp__AppRegistrationResourceId= +AIService__AzureSearchOptions__IdentityType= # system_assigned or user_assigned or key +AIService__AzureSearchOptions__Endpoint= +AIService__AzureSearchOptions__Identity__ClientId= +AIService__AzureSearchOptions__Key= +AIService__AzureSearchOptions__UsePrivateEndpoint= +AIService__AzureSearchOptions__Identity__FQName= +StorageAccount__FQEndpoint= +StorageAccount__ConnectionString= +StorageAccount__RagDocuments__Container= +OpenAI__ApiKey= +OpenAI__Endpoint= +OpenAI__EmbeddingModel= +OpenAI__EmbeddingDeployment= +OpenAI__EmbeddingDimensions=1536 diff --git a/ai_search_with_adi/ai_search/ai_search.py b/ai_search_with_adi/ai_search/ai_search.py index 126ec3e..f392400 100644 --- a/ai_search_with_adi/ai_search/ai_search.py +++ b/ai_search_with_adi/ai_search/ai_search.py @@ -307,13 +307,13 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill: ) if self.environment.identity_type != IdentityType.KEY: - adi_skill.auth_identity = self.environment.function_app_app_registration_resource_id - - if self.environment.identity_type == IdentityType.USER_ASSIGNED: adi_skill.auth_identity = ( - self.environment.ai_search_user_assigned_identity + self.environment.function_app_app_registration_resource_id ) + if self.environment.identity_type == IdentityType.USER_ASSIGNED: + adi_skill.auth_identity = self.environment.ai_search_user_assigned_identity + return adi_skill def get_vector_skill( @@ -335,12 +335,20 @@ def get_vector_skill( name="Vector Skill", description="Skill to generate embeddings", context=context, - deployment_id="0", - model_name="text-embedding-3-large", + deployment_id=self.environment.open_ai_embedding_deployment, + model_name=self.environment.open_ai_embedding_model, inputs=embedding_skill_inputs, outputs=embedding_skill_outputs, + 
dimensions=self.environment.open_ai_embedding_dimensions, ) + if self.environment.identity_type == IdentityType.KEY: + vector_skill.api_key = self.environment.open_ai_api_key + elif self.environment.identity_type == IdentityType.USER_ASSIGNED: + vector_skill.auth_identity = ( + self.environment.ai_search_user_assigned_identity + ) + return vector_skill def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill: @@ -403,6 +411,19 @@ def get_vector_search(self) -> VectorSearch: VectorSearch: The vector search configuration """ + open_ai_params = AzureOpenAIParameters( + resource_uri=self.environment.open_ai_endpoint, + modelName=self.environment.open_ai_embedding_model, + deploymentId=self.environment.open_ai_embedding_deployment, + ) + + if self.environment.identity_type == IdentityType.KEY: + open_ai_params.api_key = self.environment.open_ai_api_key + elif self.environment.identity_type == IdentityType.USER_ASSIGNED: + open_ai_params.auth_identity = ( + self.environment.ai_search_user_assigned_identity + ) + vector_search = VectorSearch( algorithms=[ HnswAlgorithmConfiguration(name=self.algorithm_name), @@ -417,7 +438,7 @@ def get_vector_search(self) -> VectorSearch: vectorizers=[ AzureOpenAIVectorizer( name=self.vectorizer_name, - azure_open_ai_parameters=AzureOpenAIParameters(), + azure_open_ai_parameters=open_ai_params, ), ], ) diff --git a/ai_search_with_adi/ai_search/environment.py b/ai_search_with_adi/ai_search/environment.py index b406cac..9729387 100644 --- a/ai_search_with_adi/ai_search/environment.py +++ b/ai_search_with_adi/ai_search/environment.py @@ -7,11 +7,13 @@ from azure.core.credentials import AzureKeyCredential from azure.search.documents.indexes.models import SearchIndexerDataUserAssignedIdentity + class IndexerType(Enum): """The type of the indexer""" RAG_DOCUMENTS = "rag-documents" + class IdentityType(Enum): """The type of the indexer""" @@ -19,8 +21,10 @@ class IdentityType(Enum): SYSTEM_ASSIGNED = "system_assigned" KEY = 
"key" + class AISearchEnvironment: """This class is used to get the environment variables for the AI search service.""" + def __init__(self, indexer_type: IndexerType): """Initialize the AISearchEnvironment class. @@ -33,7 +37,7 @@ def __init__(self, indexer_type: IndexerType): @property def normalised_indexer_type(self) -> str: """This function returns the normalised indexer type. - + Returns: str: The normalised indexer type """ @@ -46,7 +50,7 @@ def normalised_indexer_type(self) -> str: @property def identity_type(self) -> IdentityType: """This function returns the identity type. - + Returns: IdentityType: The identity type """ @@ -60,54 +64,100 @@ def identity_type(self) -> IdentityType: return IdentityType.KEY else: raise ValueError("Invalid identity type") - + @property def ai_search_endpoint(self) -> str: """This function returns the ai search endpoint. - + Returns: str: The ai search endpoint """ return os.environ.get("AIService__AzureSearchOptions__Endpoint") - + @property def ai_search_identity_id(self) -> str: """This function returns the ai search identity id. - + Returns: str: The ai search identity id """ return os.environ.get("AIService__AzureSearchOptions__Identity__ClientId") - + @property def ai_search_user_assigned_identity(self) -> SearchIndexerDataUserAssignedIdentity: """This function returns the ai search user assigned identity. - + Returns: - SearchIndexerDataUserAssignedIdentity: The ai search user assigned identity""" + SearchIndexerDataUserAssignedIdentity: The ai search user assigned identity + """ user_assigned_identity = SearchIndexerDataUserAssignedIdentity( - user_assigned_identity=os.environ.get("AIService__AzureSearchOptions__Identity__FQName") + user_assigned_identity=os.environ.get( + "AIService__AzureSearchOptions__Identity__FQName" + ) ) return user_assigned_identity @property def ai_search_credential(self) -> DefaultAzureCredential | AzureKeyCredential: """This function returns the ai search credential. 
- + Returns: DefaultAzureCredential | AzureKeyCredential: The ai search credential """ if self.identity_type in IdentityType.SYSTEM_ASSIGNED: return DefaultAzureCredential() elif self.identity_type in IdentityType.USER_ASSIGNED: - return DefaultAzureCredential(managed_identity_client_id=self.ai_search_identity_id) + return DefaultAzureCredential( + managed_identity_client_id=self.ai_search_identity_id + ) else: - return AzureKeyCredential(os.environ.get("AIService__AzureSearchOptions__Key")) + return AzureKeyCredential( + os.environ.get("AIService__AzureSearchOptions__Key") + ) + + @property + def open_ai_api_key(self) -> str: + """This function returns the open ai api key. + + Returns: + str: The open ai api key + """ + return os.environ.get("OpenAI__ApiKey") + + @property + def open_ai_endpoint(self) -> str: + """This function returns the open ai endpoint. + + Returns: + str: The open ai endpoint + """ + return os.environ.get("OpenAI__Endpoint") + + @property + def open_ai_embedding_model(self) -> str: + """This function returns the open ai embedding model. + + Returns: + str: The open ai embedding model + """ + return os.environ.get("OpenAI__EmbeddingModel") + + @property + def open_ai_embedding_deployment(self) -> str: + """This function returns the open ai embedding deployment. + + Returns: + str: The open ai embedding deployment + """ + return os.environ.get("OpenAI__EmbeddingDeployment") @property def storage_account_connection_string(self) -> str: """This function returns the blob connection string. 
If the identity type is user_assigned or system_assigned, it returns the FQEndpoint, otherwise it returns the ConnectionString""" - if self.identity_type in [IdentityType.SYSTEM_ASSIGNED, IdentityType.USER_ASSIGNED]: + if self.identity_type in [ + IdentityType.SYSTEM_ASSIGNED, + IdentityType.USER_ASSIGNED, + ]: return os.environ.get("StorageAccount__FQEndpoint") else: return os.environ.get("StorageAccount__ConnectionString") @@ -118,8 +168,10 @@ def storage_account_blob_container_name(self) -> str: This function returns azure blob container name """ - return os.environ.get(f"StorageAccount__{self.normalised_indexer_type}__Container") - + return os.environ.get( + f"StorageAccount__{self.normalised_indexer_type}__Container" + ) + @property def function_app_end_point(self) -> str: """ @@ -133,14 +185,14 @@ def function_app_key(self) -> str: This function returns function app key """ return os.environ.get("FunctionApp__Key") - + @property def function_app_app_registration_resource_id(self) -> str: """ This function returns function app app registration resource id """ return os.environ.get("FunctionApp__AppRegistrationResourceId") - + @property def function_app_pre_embedding_cleaner_route(self) -> str: """ @@ -161,9 +213,9 @@ def function_app_key_phrase_extractor_route(self) -> str: This function returns function app keyphrase extractor name """ return os.environ.get("FunctionApp__KeyPhraseExtractor__FunctionName") - + @property - def ai_search_embedding_model_dimensions(self) -> str: + def open_ai_embedding_dimensions(self) -> str: """ This function returns dimensions for embedding model. 
@@ -171,16 +223,17 @@ def ai_search_embedding_model_dimensions(self) -> str: str: The dimensions for embedding model """ - return os.environ.get( - f"AIService__AzureSearchOptions__{self.normalised_indexer_type}__EmbeddingDimensions" - ) - + return os.environ.get("OpenAI__EmbeddingDimensions") + @property def use_private_endpoint(self) -> bool: """ This function returns true if private endpoint is used """ - return os.environ.get("AIService__AzureSearchOptions__UsePrivateEndpoint") == "true" + return ( + os.environ.get("AIService__AzureSearchOptions__UsePrivateEndpoint") + == "true" + ) def get_custom_skill_function_url(self, skill_type: str): """ @@ -194,7 +247,9 @@ def get_custom_skill_function_url(self, skill_type: str): route = self.function_app_key_phrase_extractor_route else: raise ValueError(f"Invalid skill type: {skill_type}") - - full_url = f"{self.function_app_end_point}/api/{route}?code={self.function_app_key}" - return full_url \ No newline at end of file + full_url = ( + f"{self.function_app_end_point}/api/{route}?code={self.function_app_key}" + ) + + return full_url diff --git a/ai_search_with_adi/ai_search/rag_documents.py b/ai_search_with_adi/ai_search/rag_documents.py index cf581d4..7c3184f 100644 --- a/ai_search_with_adi/ai_search/rag_documents.py +++ b/ai_search_with_adi/ai_search/rag_documents.py @@ -83,7 +83,7 @@ def get_index_fields(self) -> list[SearchableField]: SearchField( name="ChunkEmbedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), - vector_search_dimensions=self.environment.ai_search_embedding_model_dimensions, + vector_search_dimensions=self.environment.open_ai_embedding_dimensions, vector_search_profile_name=self.vector_search_profile_name, ), SearchableField( diff --git a/ai_search_with_adi/function_apps/indexer/pre_embedding_cleaner.py b/ai_search_with_adi/function_apps/indexer/pre_embedding_cleaner.py index 79cbaae..b3303d6 100644 --- a/ai_search_with_adi/function_apps/indexer/pre_embedding_cleaner.py +++ 
b/ai_search_with_adi/function_apps/indexer/pre_embedding_cleaner.py @@ -1,9 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. - import logging import json -import string import nltk import re from nltk.tokenize import word_tokenize @@ -11,35 +9,32 @@ nltk.download("punkt") nltk.download("stopwords") -import re - -# Configure logging -logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s') -def get_section(cleaned_text:str) -> list: +def get_section(cleaned_text: str) -> list: """ Returns the section details from the content Args: cleaned_text: The input text - + Returns: list: The sections related to text - + """ - combined_pattern = r'(.*?)\n===|\n## ?(.*?)\n|\n### ?(.*?)\n' + combined_pattern = r"(.*?)\n===|\n## ?(.*?)\n|\n### ?(.*?)\n" doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL) doc_metadata = [match for group in doc_metadata for match in group if match] return doc_metadata -def remove_markdown_tags(text:str, tag_patterns:dict) ->str: + +def remove_markdown_tags(text: str, tag_patterns: dict) -> str: """ Remove specified Markdown tags from the text, keeping the contents of the tags. - + Args: text: The input text containing Markdown tags. tag_patterns: A dictionary where keys are tags and values are their specific patterns. - + Returns: str: The text with specified tags removed. 
""" @@ -47,13 +42,14 @@ def remove_markdown_tags(text:str, tag_patterns:dict) ->str: for tag, pattern in tag_patterns.items(): try: # Replace the tags using the specific pattern, keeping the content inside the tags - text = re.sub(pattern, r'\1', text, flags=re.DOTALL) + text = re.sub(pattern, r"\1", text, flags=re.DOTALL) except re.error as e: - logging.error(f"Regex error for tag '{tag}': {e}") + logging.error(f"Regex error for tag '{tag}': {e}") except Exception as e: logging.error(f"An error occurred in remove_markdown_tags: {e}") return text + def clean_text(src_text: str) -> str: """This function performs following cleanup activities on the text, remove all unicode characters remove line spacing,remove stop words, normalize characters @@ -63,7 +59,7 @@ def clean_text(src_text: str) -> str: Returns: str: The clean text.""" - + try: # Define specific patterns for each tag tag_patterns = { @@ -119,16 +115,22 @@ async def process_pre_embedding_cleaner(record: dict) -> dict: } # scenarios when page by chunking is enabled - if isinstance(record["data"]["chunk"],dict): - cleaned_record["data"]["cleaned_chunk"] = clean_text(record["data"]["chunk"]["content"]) + if isinstance(record["data"]["chunk"], dict): + cleaned_record["data"]["cleaned_chunk"] = clean_text( + record["data"]["chunk"]["content"] + ) cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"] cleaned_record["data"]["section"] = record["data"]["chunk"]["section"] - cleaned_record["data"]["page_number"] = record["data"]["chunk"]["page_number"] + cleaned_record["data"]["page_number"] = record["data"]["chunk"][ + "page_number" + ] else: - cleaned_record["data"]["cleaned_chunk"] = clean_text(record["data"]["chunk"]) + cleaned_record["data"]["cleaned_chunk"] = clean_text( + record["data"]["chunk"] + ) cleaned_record["data"]["chunk"] = record["data"]["chunk"] cleaned_record["data"]["section"] = get_section(record["data"]["chunk"]) - + except Exception as e: logging.error("string cleanup Error: 
%s", e) return { From 424e090df6733ea78f5e56053a11443f75eac788 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 15:34:23 +0100 Subject: [PATCH 13/33] Update route --- ai_search_with_adi/ai_search/.env | 2 +- ai_search_with_adi/function_apps/indexer/function_app.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ai_search_with_adi/ai_search/.env b/ai_search_with_adi/ai_search/.env index 77c0372..f7efa45 100644 --- a/ai_search_with_adi/ai_search/.env +++ b/ai_search_with_adi/ai_search/.env @@ -2,7 +2,7 @@ FunctionApp__Endpoint= FunctionApp__Key= FunctionApp__PreEmbeddingCleaner__FunctionName=pre_embedding_cleaner FunctionApp__ADI__FunctionName=adi_2_ai_search -FunctionApp__KeyPhraseExtractor__FunctionName=keyphrase_extractor +FunctionApp__KeyPhraseExtractor__FunctionName=key_phrase_extractor FunctionApp__AppRegistrationResourceId= AIService__AzureSearchOptions__IdentityType= # system_assigned or user_assigned or key AIService__AzureSearchOptions__Endpoint= diff --git a/ai_search_with_adi/function_apps/indexer/function_app.py b/ai_search_with_adi/function_apps/indexer/function_app.py index 6057ec7..eacb1e3 100644 --- a/ai_search_with_adi/function_apps/indexer/function_app.py +++ b/ai_search_with_adi/function_apps/indexer/function_app.py @@ -169,8 +169,8 @@ async def pre_embedding_cleaner(req: func.HttpRequest) -> func.HttpResponse: ) -@app.route(route="keyphrase_extractor", methods=[func.HttpMethod.POST]) -async def keyphrase_extractor(req: func.HttpRequest) -> func.HttpResponse: +@app.route(route="key_phrase_extractor", methods=[func.HttpMethod.POST]) +async def key_phrase_extractor(req: func.HttpRequest) -> func.HttpResponse: """HTTP trigger for data cleanup function. 
Args: From fc0868933c8a33e422d09b848ceabfa0a381c838 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 16:20:52 +0100 Subject: [PATCH 14/33] Fix adi bugs --- ai_search_with_adi/README.md | 6 +- ai_search_with_adi/ai_search/README.md | 18 ++ .../adi_2_ai_search.py | 181 ++++++++++-------- .../common/ai_search.py | 0 .../common/delay_processing_exception.py | 0 .../common/payloads/error.py | 0 .../common/payloads/header.py | 1 + .../common/payloads/payload.py | 0 .../payloads/pending_index_completion.py | 0 .../common/payloads/pending_index_trigger.py | 4 +- .../common/requirements.txt | 0 .../common/service_bus.py | 0 .../indexer => function_app}/function_app.py | 0 .../key_phrase_extraction.py | 52 +++-- .../indexer => function_app}/ocr.py | 0 .../pending_index_completion.py | 2 +- .../pending_index_trigger.py | 0 .../pre_embedding_cleaner.py | 0 .../indexer => function_app}/requirements.txt | 6 - .../storage_account.py | 0 .../indexer => function_app}/text_split.py | 0 21 files changed, 164 insertions(+), 106 deletions(-) create mode 100644 ai_search_with_adi/ai_search/README.md rename ai_search_with_adi/{function_apps/indexer => function_app}/adi_2_ai_search.py (74%) rename ai_search_with_adi/{function_apps => function_app}/common/ai_search.py (100%) rename ai_search_with_adi/{function_apps => function_app}/common/delay_processing_exception.py (100%) rename ai_search_with_adi/{function_apps => function_app}/common/payloads/error.py (100%) rename ai_search_with_adi/{function_apps => function_app}/common/payloads/header.py (99%) rename ai_search_with_adi/{function_apps => function_app}/common/payloads/payload.py (100%) rename ai_search_with_adi/{function_apps => function_app}/common/payloads/pending_index_completion.py (100%) rename ai_search_with_adi/{function_apps => function_app}/common/payloads/pending_index_trigger.py (91%) rename ai_search_with_adi/{function_apps => function_app}/common/requirements.txt (100%) rename 
ai_search_with_adi/{function_apps => function_app}/common/service_bus.py (100%) rename ai_search_with_adi/{function_apps/indexer => function_app}/function_app.py (100%) rename ai_search_with_adi/{function_apps/indexer => function_app}/key_phrase_extraction.py (70%) rename ai_search_with_adi/{function_apps/indexer => function_app}/ocr.py (100%) rename ai_search_with_adi/{function_apps/indexer => function_app}/pending_index_completion.py (98%) rename ai_search_with_adi/{function_apps/indexer => function_app}/pending_index_trigger.py (100%) rename ai_search_with_adi/{function_apps/indexer => function_app}/pre_embedding_cleaner.py (100%) rename ai_search_with_adi/{function_apps/indexer => function_app}/requirements.txt (70%) rename ai_search_with_adi/{function_apps/common => function_app}/storage_account.py (100%) rename ai_search_with_adi/{function_apps/indexer => function_app}/text_split.py (100%) diff --git a/ai_search_with_adi/README.md b/ai_search_with_adi/README.md index d43d1e7..6165ca4 100644 --- a/ai_search_with_adi/README.md +++ b/ai_search_with_adi/README.md @@ -38,10 +38,14 @@ The properties returned from the ADI Custom Skill are then used to perform the f ## Provided Notebooks \& Utilities -- `./ai_search.py`, `./deployment.py` provide an easy Python based utility for deploying an index, indexer and corresponding skillset for AI Search. +- `./ai_search.py`, `./deploy.py` provide an easy Python based utility for deploying an index, indexer and corresponding skillset for AI Search. - `./function_apps/indexer` provides a pre-built Python function app that communicates with Azure Document Intelligence, Azure OpenAI etc to perform the Markdown conversion, extraction of figures, figure understanding and corresponding cleaning of Markdown. - `./rag_with_ai_search.ipynb` provides example of how to utilise the AI Search plugin to query the index. 
+## Deploying AI Search Setup + +To deploy the pre-built index and associated indexer / skillset setup, see instructions in `./ai_search/README.md`. + ## ADI Custom Skill Deploy the associated function app and required resources. You can then experiment with the custom skill by sending an HTTP request in the AI Search JSON format to the `/adi_2_ai_search` HTTP endpoint. diff --git a/ai_search_with_adi/ai_search/README.md b/ai_search_with_adi/ai_search/README.md new file mode 100644 index 0000000..c124cd5 --- /dev/null +++ b/ai_search_with_adi/ai_search/README.md @@ -0,0 +1,18 @@ +# AI Search Indexing with Azure Document Intelligence - Pre-built Index Setup + +The associated scripts in this portion of the repository contains pre-built scripts to deploy the skillset with Azure Document Intelligence. + +## Steps + +1. Update `.env` file with the associated values. Not all values are required dependent on whether you are using System / User Assigned Identities or a Key based authentication. +2. Adjust `rag_documents.py` with any changes to the index / indexer. The `get_skills()` method implements the skills pipeline. Make any adjustments here in the skills needed to enrich the data source. +3. Run `deploy.py` with the following args: + + - `indexer_type rag`. This selects the `rag_documents` sub class. + - `enable_page_chunking True`. This determines whether page wise chunking is applied in ADI, or whether the inbuilt skill is used for TextSplit. **Page wise analysis in ADI is recommended to avoid splitting tables / figures across multiple chunks, when the chunking is performed.** + - `rebuild`. Whether to delete and rebuild the index. + - `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want deploy a test version, before overwriting the main version. + +## ai_search.py & environment.py + +This includes a variety of helper files and scripts to deploy the index setup. 
This is useful for CI/CD to avoid having to write JSON files manually or use the UI to deploy the pipeline. diff --git a/ai_search_with_adi/function_apps/indexer/adi_2_ai_search.py b/ai_search_with_adi/function_app/adi_2_ai_search.py similarity index 74% rename from ai_search_with_adi/function_apps/indexer/adi_2_ai_search.py rename to ai_search_with_adi/function_app/adi_2_ai_search.py index ae0474a..0bf695e 100644 --- a/ai_search_with_adi/function_apps/indexer/adi_2_ai_search.py +++ b/ai_search_with_adi/function_app/adi_2_ai_search.py @@ -12,10 +12,11 @@ from PIL import Image import io import logging -from common.storage_account import StorageAccountHelper +from storage_account import StorageAccountHelper import concurrent.futures import json -from openai import AzureOpenAI +from openai import AsyncAzureOpenAI +import openai def crop_image_from_pdf_page(pdf_path, page_number, bounding_box): @@ -42,7 +43,7 @@ def crop_image_from_pdf_page(pdf_path, page_number, bounding_box): def clean_adi_markdown( - markdown_text: str, page_no: int, remove_irrelevant_figures=False + markdown_text: str, page_no: int = None, remove_irrelevant_figures=False ): """Clean Markdown text extracted by the Azure Document Intelligence service. @@ -56,21 +57,6 @@ def clean_adi_markdown( str: The cleaned Markdown text. 
""" - # # Remove the page number comment - # page_number_pattern = r"" - # cleaned_text = re.sub(page_number_pattern, "", markdown_text) - - # # Replace the page header comment with its content - # page_header_pattern = r"" - # cleaned_text = re.sub( - # page_header_pattern, lambda match: match.group(1), cleaned_text - # ) - - # # Replace the page footer comment with its content - # page_footer_pattern = r"" - # cleaned_text = re.sub( - # page_footer_pattern, lambda match: match.group(1), cleaned_text - # ) output_dict = {} comment_patterns = r"||" cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL) @@ -94,7 +80,7 @@ def clean_adi_markdown( output_dict["sections"] = doc_metadata # add page number when chunk by page is enabled - if page_no > -1: + if page_no is not None: output_dict["page_number"] = page_no return output_dict @@ -135,7 +121,7 @@ def update_figure_description(md_content, img_description, idx): return new_md_content -async def understand_image_with_gptv(image_base64, caption): +async def understand_image_with_gptv(image_base64, caption, tries_left=3): """ Generates a description for an image using the GPT-4V model. 
@@ -153,57 +139,81 @@ async def understand_image_with_gptv(image_base64, caption): deployment_name = os.environ["AzureAI__GPT4V_Deployment"] api_base = os.environ["AzureAI__GPT4V_APIbase"] - client = AzureOpenAI( - api_key=api_key, - api_version=api_version, - base_url=f"{api_base}/openai/deployments/{deployment_name}", - ) - - # We send both image caption and the image body to GPTv for better understanding - if caption != "": - response = client.chat.completions.create( - model=deployment_name, - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - { - "role": "user", - "content": [ - { - "type": "text", - "text": f"Describe this image (note: it has image caption: {caption}):", - }, + try: + async with AsyncAzureOpenAI( + api_key=api_key, + api_version=api_version, + base_url=f"{api_base}/openai/deployments/{deployment_name}", + ) as client: + # We send both image caption and the image body to GPTv for better understanding + if caption != "": + response = await client.chat.completions.create( + model=deployment_name, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, { - "type": "image_base64", - "image_base64": {"image": image_base64}, + "role": "user", + "content": [ + { + "type": "text", + "text": f"Describe this image with technical analysis. Provide a well-structured, description. IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'. 
(note: it has image caption: {caption}):", + }, + { + "type": "image_base64", + "image_base64": {"image": image_base64}, + }, + ], }, ], - }, - ], - max_tokens=MAX_TOKENS, - ) + max_tokens=MAX_TOKENS, + ) - else: - response = client.chat.completions.create( - model=deployment_name, - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - { - "role": "user", - "content": [ - {"type": "text", "text": "Describe this image:"}, + else: + response = await client.chat.completions.create( + model=deployment_name, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, { - "type": "image_base64", - "image_base64": {"image": image_base64}, + "role": "user", + "content": [ + { + "type": "text", + "text": "Describe this image with technical analysis. Provide a well-structured, description. IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'.", + }, + { + "type": "image_base64", + "image_base64": {"image": image_base64}, + }, + ], }, ], - }, - ], - max_tokens=MAX_TOKENS, - ) + max_tokens=MAX_TOKENS, + ) + + img_description = response.choices[0].message.content - img_description = response.choices[0].message.content + logging.info(f"Image Description: {img_description}") - return img_description + return img_description + except openai.RateLimitError as e: + logging.error("OpenAI Rate Limit Error: %s", e) + + if tries_left > 0: + logging.info( + "Retrying understanding of image with %s tries left.", tries_left + ) + remaining_tries = tries_left - 1 + backoff = 20 ** (3 - remaining_tries) + await asyncio.sleep(backoff) + return await understand_image_with_gptv( + image_base64, caption, tries_left=remaining_tries + ) + else: + raise Exception("OpenAI Rate Limit Error: No retries left.") from e + except (openai.OpenAIError, openai.APIConnectionError) as e: + logging.error("OpenAI Error: %s", e) + + raise Exception("OpenAI Rate Limit Error: No retries left.") from e def pil_image_to_base64(image, 
image_format="JPEG"): @@ -263,7 +273,9 @@ async def process_figures_from_extracted_content( image_base64 = pil_image_to_base64(cropped_image) - img_description += await understand_image_with_gptv(image_base64) + img_description += await understand_image_with_gptv( + image_base64, figure.caption.content + ) logging.info(f"\tDescription of figure {idx}: {img_description}") markdown_content = update_figure_description( @@ -287,13 +299,12 @@ def create_page_wise_content(result: AnalyzeResult) -> list: page_wise_content = [] page_numbers = [] - page_number = 0 - for page in result.pages: + + for page_number, page in enumerate(result.pages): page_content = result.content[ page.spans[0]["offset"] : page.spans[0]["offset"] + page.spans[0]["length"] ] page_wise_content.append(page_content) - page_number += 1 page_numbers.append(page_number) return page_wise_content, page_numbers @@ -311,7 +322,6 @@ async def analyse_document(file_path: str) -> AnalyzeResult: AnalyzeResult: The result of the document analysis.""" with open(file_path, "rb") as f: file_read = f.read() - # base64_encoded_file = base64.b64encode(file_read).decode("utf-8") async with DocumentIntelligenceClient( endpoint=os.environ["AIService__Services__Endpoint"], @@ -335,6 +345,16 @@ async def analyse_document(file_path: str) -> AnalyzeResult: async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> dict: + """Process the extracted content from the Azure Document Intelligence service and prepare it for Azure Search. + + Args: + ----- + record (dict): The record containing the extracted content. + chunk_by_page (bool): Whether to chunk the content by page. 
+ + Returns: + -------- + dict: The processed content ready for Azure Search.""" logging.info("Python HTTP trigger function processed a request.") storage_account_helper = StorageAccountHelper() @@ -431,20 +451,26 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> try: if chunk_by_page: cleaned_result = [] - markdown_content, page_no = create_page_wise_content(result) - tasks = [ + markdown_content, page_numbers = create_page_wise_content(result) + content_with_figures_tasks = [ process_figures_from_extracted_content( - temp_file_path, page_content, result.figures, page_number=idx + temp_file_path, + page_content, + result.figures, + page_number=page_number, ) - for idx, page_content in enumerate(markdown_content) + for page_content, page_number in zip(markdown_content, page_numbers) ] - content_with_figures = await asyncio.gather(*tasks) + content_with_figures = await asyncio.gather(*content_with_figures_tasks) + with concurrent.futures.ProcessPoolExecutor() as executor: futures = { executor.submit( - clean_adi_markdown, page_content, False + clean_adi_markdown, page_content, page_number, False ): page_content - for page_content in content_with_figures + for page_content, page_number in zip( + content_with_figures, page_numbers + ) } for future in concurrent.futures.as_completed(futures): cleaned_result.append(future.result()) @@ -455,7 +481,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> temp_file_path, markdown_content, result.figures ) cleaned_result = clean_adi_markdown( - content_with_figures, page_no=-1, remove_irrelevant_figures=False + content_with_figures, remove_irrelevant_figures=False ) except Exception as e: logging.error(e) @@ -483,7 +509,4 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> logging.info(f"final output: {json_str}") - return { - "recordId": record["recordId"], - "data": {"extracted_content": cleaned_result}, - } + return src diff --git 
a/ai_search_with_adi/function_apps/common/ai_search.py b/ai_search_with_adi/function_app/common/ai_search.py similarity index 100% rename from ai_search_with_adi/function_apps/common/ai_search.py rename to ai_search_with_adi/function_app/common/ai_search.py diff --git a/ai_search_with_adi/function_apps/common/delay_processing_exception.py b/ai_search_with_adi/function_app/common/delay_processing_exception.py similarity index 100% rename from ai_search_with_adi/function_apps/common/delay_processing_exception.py rename to ai_search_with_adi/function_app/common/delay_processing_exception.py diff --git a/ai_search_with_adi/function_apps/common/payloads/error.py b/ai_search_with_adi/function_app/common/payloads/error.py similarity index 100% rename from ai_search_with_adi/function_apps/common/payloads/error.py rename to ai_search_with_adi/function_app/common/payloads/error.py diff --git a/ai_search_with_adi/function_apps/common/payloads/header.py b/ai_search_with_adi/function_app/common/payloads/header.py similarity index 99% rename from ai_search_with_adi/function_apps/common/payloads/header.py rename to ai_search_with_adi/function_app/common/payloads/header.py index d90e684..c56a39b 100644 --- a/ai_search_with_adi/function_apps/common/payloads/header.py +++ b/ai_search_with_adi/function_app/common/payloads/header.py @@ -19,6 +19,7 @@ class TaskEnum(Enum): PENDING_INDEX_COMPLETION = "pending_index_completion" PENDING_INDEX_TRIGGER = "pending_index_trigger" + class Header(BaseModel): """Header model""" diff --git a/ai_search_with_adi/function_apps/common/payloads/payload.py b/ai_search_with_adi/function_app/common/payloads/payload.py similarity index 100% rename from ai_search_with_adi/function_apps/common/payloads/payload.py rename to ai_search_with_adi/function_app/common/payloads/payload.py diff --git a/ai_search_with_adi/function_apps/common/payloads/pending_index_completion.py b/ai_search_with_adi/function_app/common/payloads/pending_index_completion.py similarity 
index 100% rename from ai_search_with_adi/function_apps/common/payloads/pending_index_completion.py rename to ai_search_with_adi/function_app/common/payloads/pending_index_completion.py diff --git a/ai_search_with_adi/function_apps/common/payloads/pending_index_trigger.py b/ai_search_with_adi/function_app/common/payloads/pending_index_trigger.py similarity index 91% rename from ai_search_with_adi/function_apps/common/payloads/pending_index_trigger.py rename to ai_search_with_adi/function_app/common/payloads/pending_index_trigger.py index e4fd62b..199bf36 100644 --- a/ai_search_with_adi/function_apps/common/payloads/pending_index_trigger.py +++ b/ai_search_with_adi/function_app/common/payloads/pending_index_trigger.py @@ -20,7 +20,9 @@ class PendingIndexTriggerBody(BaseModel): id_name: Optional[str] = Field( None, description="The text name for the integer ID field" ) - additional_field: Optional[str] = Field(None, description="Description of additional_field") + additional_field: Optional[str] = Field( + None, description="Description of additional_field" + ) __config__ = ConfigDict(extra="ignore") diff --git a/ai_search_with_adi/function_apps/common/requirements.txt b/ai_search_with_adi/function_app/common/requirements.txt similarity index 100% rename from ai_search_with_adi/function_apps/common/requirements.txt rename to ai_search_with_adi/function_app/common/requirements.txt diff --git a/ai_search_with_adi/function_apps/common/service_bus.py b/ai_search_with_adi/function_app/common/service_bus.py similarity index 100% rename from ai_search_with_adi/function_apps/common/service_bus.py rename to ai_search_with_adi/function_app/common/service_bus.py diff --git a/ai_search_with_adi/function_apps/indexer/function_app.py b/ai_search_with_adi/function_app/function_app.py similarity index 100% rename from ai_search_with_adi/function_apps/indexer/function_app.py rename to ai_search_with_adi/function_app/function_app.py diff --git 
a/ai_search_with_adi/function_apps/indexer/key_phrase_extraction.py b/ai_search_with_adi/function_app/key_phrase_extraction.py similarity index 70% rename from ai_search_with_adi/function_apps/indexer/key_phrase_extraction.py rename to ai_search_with_adi/function_app/key_phrase_extraction.py index d8c023b..1de62bc 100644 --- a/ai_search_with_adi/function_apps/indexer/key_phrase_extraction.py +++ b/ai_search_with_adi/function_app/key_phrase_extraction.py @@ -11,54 +11,68 @@ MAX_TEXT_ELEMENTS = 5120 + def split_document(document, max_size): """Split a document into chunks of max_size.""" - return [document[i:i + max_size] for i in range(0, len(document), max_size)] + return [document[i : i + max_size] for i in range(0, len(document), max_size)] + -async def extract_key_phrases_from_text(data: list[str],max_key_phrase_count:int) -> list[str]: +async def extract_key_phrases_from_text( + data: list[str], max_key_phrase_count: int +) -> list[str]: logging.info("Python HTTP trigger function processed a request.") max_retries = 5 key_phrase_list = [] text_analytics_client = TextAnalyticsClient( - endpoint=os.environ["AIService__Services__Endpoint"], - credential=AzureKeyCredential(os.environ["AIService__Services__Key"]), - ) + endpoint=os.environ["AIService__Services__Endpoint"], + credential=AzureKeyCredential(os.environ["AIService__Services__Key"]), + ) try: async with text_analytics_client: - retries = 0 - while retries < max_retries: + retries = 0 + while retries < max_retries: try: - # Split large documents + # Split large documents split_documents = [] for doc in data: if len(doc) > MAX_TEXT_ELEMENTS: - split_documents.extend(split_document(doc, MAX_TEXT_ELEMENTS)) + split_documents.extend( + split_document(doc, MAX_TEXT_ELEMENTS) + ) else: split_documents.append(doc) - result = await text_analytics_client.extract_key_phrases(split_documents) - for idx,doc in enumerate(result): + result = await text_analytics_client.extract_key_phrases( + split_documents + ) + for 
idx, doc in enumerate(result): if not doc.is_error: - key_phrase_list.extend(doc.key_phrases[:max_key_phrase_count]) + key_phrase_list.extend( + doc.key_phrases[:max_key_phrase_count] + ) else: raise Exception(f"Document {idx} error: {doc.error}") break # Exit the loop if the request is successful except HttpResponseError as e: if e.status_code == 429: # Rate limiting error retries += 1 - wait_time = 2 ** retries # Exponential backoff - print(f"Rate limit exceeded. Retrying in {wait_time} seconds...") + wait_time = 2**retries # Exponential backoff + print( + f"Rate limit exceeded. Retrying in {wait_time} seconds..." + ) await asyncio.sleep(wait_time) else: raise Exception(f"An error occurred: {e}") except Exception as e: raise Exception(f"An error occurred: {e}") - + return key_phrase_list -async def process_key_phrase_extraction(record: dict,max_key_phrase_count:int =5 ) -> dict: +async def process_key_phrase_extraction( + record: dict, max_key_phrase_count: int = 5 +) -> dict: """Extract key phrases using azure ai services. 
Args: @@ -79,7 +93,7 @@ async def process_key_phrase_extraction(record: dict,max_key_phrase_count:int =5 "warnings": None, } extracted_record["data"]["keyPhrases"] = await extract_key_phrases_from_text( - [record["data"]["text"]],max_key_phrase_count + [record["data"]["text"]], max_key_phrase_count ) except Exception as e: logging.error("key phrase extraction Error: %s", e) @@ -93,7 +107,9 @@ async def process_key_phrase_extraction(record: dict,max_key_phrase_count:int =5 } extracted_record["data"][ "keyPhrases" - ] = await extract_key_phrases_from_text([record["data"]["text"]],max_key_phrase_count) + ] = await extract_key_phrases_from_text( + [record["data"]["text"]], max_key_phrase_count + ) except Exception as inner_e: logging.error("key phrase extraction Error: %s", inner_e) logging.error( diff --git a/ai_search_with_adi/function_apps/indexer/ocr.py b/ai_search_with_adi/function_app/ocr.py similarity index 100% rename from ai_search_with_adi/function_apps/indexer/ocr.py rename to ai_search_with_adi/function_app/ocr.py diff --git a/ai_search_with_adi/function_apps/indexer/pending_index_completion.py b/ai_search_with_adi/function_app/pending_index_completion.py similarity index 98% rename from ai_search_with_adi/function_apps/indexer/pending_index_completion.py rename to ai_search_with_adi/function_app/pending_index_completion.py index 3488488..4c4d1ba 100644 --- a/ai_search_with_adi/function_apps/indexer/pending_index_completion.py +++ b/ai_search_with_adi/function_app/pending_index_completion.py @@ -5,7 +5,7 @@ from common.service_bus import ServiceBusHelper from common.payloads.pending_index_completion import PendingIndexCompletionPayload from common.payloads.pending_index_trigger import PendingIndexTriggerPayload -from common.payloads.header import TaskEnum, DataTypeEnum +from common.payloads.header import TaskEnum from common.payloads.error import Error from datetime import datetime, timedelta, timezone from common.delay_processing_exception import 
DelayProcessingException diff --git a/ai_search_with_adi/function_apps/indexer/pending_index_trigger.py b/ai_search_with_adi/function_app/pending_index_trigger.py similarity index 100% rename from ai_search_with_adi/function_apps/indexer/pending_index_trigger.py rename to ai_search_with_adi/function_app/pending_index_trigger.py diff --git a/ai_search_with_adi/function_apps/indexer/pre_embedding_cleaner.py b/ai_search_with_adi/function_app/pre_embedding_cleaner.py similarity index 100% rename from ai_search_with_adi/function_apps/indexer/pre_embedding_cleaner.py rename to ai_search_with_adi/function_app/pre_embedding_cleaner.py diff --git a/ai_search_with_adi/function_apps/indexer/requirements.txt b/ai_search_with_adi/function_app/requirements.txt similarity index 70% rename from ai_search_with_adi/function_apps/indexer/requirements.txt rename to ai_search_with_adi/function_app/requirements.txt index 48c9837..adf82be 100644 --- a/ai_search_with_adi/function_apps/indexer/requirements.txt +++ b/ai_search_with_adi/function_app/requirements.txt @@ -17,10 +17,4 @@ azure-ai-documentintelligence azure-ai-textanalytics azure-ai-vision-imageanalysis PyMuPDF -pillow -torch aiohttp -spacy==3.7.5 -transformers -scikit-learn -en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1.tar.gz diff --git a/ai_search_with_adi/function_apps/common/storage_account.py b/ai_search_with_adi/function_app/storage_account.py similarity index 100% rename from ai_search_with_adi/function_apps/common/storage_account.py rename to ai_search_with_adi/function_app/storage_account.py diff --git a/ai_search_with_adi/function_apps/indexer/text_split.py b/ai_search_with_adi/function_app/text_split.py similarity index 100% rename from ai_search_with_adi/function_apps/indexer/text_split.py rename to ai_search_with_adi/function_app/text_split.py From 616899502401a586e84372442584dfd7f6171bc0 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 
10 Sep 2024 16:25:21 +0100 Subject: [PATCH 15/33] Remove uneeded code --- .../function_app/common/ai_search.py | 130 ------- .../common/delay_processing_exception.py | 4 - .../function_app/common/payloads/error.py | 23 -- .../function_app/common/payloads/header.py | 42 --- .../function_app/common/payloads/payload.py | 23 -- .../payloads/pending_index_completion.py | 43 --- .../common/payloads/pending_index_trigger.py | 39 -- .../function_app/common/requirements.txt | 11 - .../function_app/common/service_bus.py | 46 --- .../function_app/function_app.py | 173 +-------- ai_search_with_adi/function_app/ocr.py | 86 ----- .../function_app/pending_index_completion.py | 107 ------ .../function_app/pending_index_trigger.py | 94 ----- ai_search_with_adi/function_app/text_split.py | 355 ------------------ 14 files changed, 1 insertion(+), 1175 deletions(-) delete mode 100644 ai_search_with_adi/function_app/common/ai_search.py delete mode 100644 ai_search_with_adi/function_app/common/delay_processing_exception.py delete mode 100644 ai_search_with_adi/function_app/common/payloads/error.py delete mode 100644 ai_search_with_adi/function_app/common/payloads/header.py delete mode 100644 ai_search_with_adi/function_app/common/payloads/payload.py delete mode 100644 ai_search_with_adi/function_app/common/payloads/pending_index_completion.py delete mode 100644 ai_search_with_adi/function_app/common/payloads/pending_index_trigger.py delete mode 100644 ai_search_with_adi/function_app/common/requirements.txt delete mode 100644 ai_search_with_adi/function_app/common/service_bus.py delete mode 100644 ai_search_with_adi/function_app/ocr.py delete mode 100644 ai_search_with_adi/function_app/pending_index_completion.py delete mode 100644 ai_search_with_adi/function_app/pending_index_trigger.py delete mode 100644 ai_search_with_adi/function_app/text_split.py diff --git a/ai_search_with_adi/function_app/common/ai_search.py b/ai_search_with_adi/function_app/common/ai_search.py deleted file mode 
100644 index eedf27e..0000000 --- a/ai_search_with_adi/function_app/common/ai_search.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -from azure.search.documents.indexes.aio import SearchIndexerClient, SearchIndexClient -from azure.search.documents.aio import SearchClient -from azure.search.documents.indexes.models import SynonymMap -from azure.identity import DefaultAzureCredential -from azure.core.exceptions import HttpResponseError -import logging -import os -from enum import Enum -from openai import AsyncAzureOpenAI -from azure.search.documents.models import VectorizedQuery - - -class IndexerStatusEnum(Enum): - RETRIGGER = "RETRIGGER" - RUNNING = "RUNNING" - SUCCESS = "SUCCESS" - - -class AISearchHelper: - def __init__(self): - self._client_id = os.environ["FunctionApp__ClientId"] - - self._endpoint = os.environ["AIService__AzureSearchOptions__Endpoint"] - - async def get_index_client(self): - credential = DefaultAzureCredential(managed_identity_client_id=self._client_id) - - return SearchIndexClient(self._endpoint, credential) - - async def get_indexer_client(self): - credential = DefaultAzureCredential(managed_identity_client_id=self._client_id) - - return SearchIndexerClient(self._endpoint, credential) - - async def get_search_client(self, index_name): - credential = DefaultAzureCredential(managed_identity_client_id=self._client_id) - - return SearchClient(self._endpoint, index_name, credential) - - async def upload_synonym_map(self, synonym_map_name: str, synonyms: str): - index_client = await self.get_index_client() - async with index_client: - try: - await index_client.delete_synonym_map(synonym_map_name) - except HttpResponseError as e: - logging.error("Unable to delete synonym map %s", e) - - logging.info("Synonyms: %s", synonyms) - synonym_map = SynonymMap(name=synonym_map_name, synonyms=synonyms) - await index_client.create_synonym_map(synonym_map) - - async def get_indexer_status(self, 
indexer_name): - indexer_client = await self.get_indexer_client() - async with indexer_client: - try: - status = await indexer_client.get_indexer_status(indexer_name) - - last_execution_result = status.last_result - - if last_execution_result.status == "inProgress": - return IndexerStatusEnum.RUNNING, last_execution_result.start_time - elif last_execution_result.status in ["success", "transientFailure"]: - return IndexerStatusEnum.SUCCESS, last_execution_result.start_time - else: - return IndexerStatusEnum.RETRIGGER, last_execution_result.start_time - except HttpResponseError as e: - logging.error("Unable to get indexer status %s", e) - - async def trigger_indexer(self, indexer_name): - indexer_client = await self.get_indexer_client() - async with indexer_client: - try: - await indexer_client.run_indexer(indexer_name) - except HttpResponseError as e: - logging.error("Unable to run indexer %s", e) - - async def search_index( - self, index_name, semantic_config, search_text, filter_field=None - ): - """Search the index using the provided search text.""" - async with AsyncAzureOpenAI( - # This is the default and can be omitted - api_key=os.environ["AIService__Compass_Key"], - azure_endpoint=os.environ["AIService__Compass_Endpoint"], - api_version="2023-03-15-preview", - ) as open_ai_client: - embeddings = await open_ai_client.embeddings.create( - model=os.environ["AIService__Compass_Models__Embedding"], - input=search_text, - ) - - # Extract the embedding vector - embedding_vector = embeddings.data[0].embedding - - vector_query = VectorizedQuery( - vector=embedding_vector, - k_nearest_neighbors=5, - fields="ChunkEmbedding", - ) - - if filter_field: - filter_expression = f"filter_field eq '{filter_field}'" - else: - filter_expression = None - - logging.info(f"Filter Expression: {filter_expression}") - - search_client = await self.get_search_client(index_name) - async with search_client: - results = await search_client.search( - top=3, - query_type="semantic", - 
semantic_configuration_name=semantic_config, - search_text=search_text, - select="Title,Chunk", - vector_queries=[vector_query], - filter=filter_expression, - ) - - documents = [ - document - async for result in results.by_page() - async for document in result - ] - - logging.info(f"Documents: {documents}") - return documents diff --git a/ai_search_with_adi/function_app/common/delay_processing_exception.py b/ai_search_with_adi/function_app/common/delay_processing_exception.py deleted file mode 100644 index a8ef226..0000000 --- a/ai_search_with_adi/function_app/common/delay_processing_exception.py +++ /dev/null @@ -1,4 +0,0 @@ -class DelayProcessingException(Exception): - """Exception to delay processing.""" - - pass diff --git a/ai_search_with_adi/function_app/common/payloads/error.py b/ai_search_with_adi/function_app/common/payloads/error.py deleted file mode 100644 index 5a7f443..0000000 --- a/ai_search_with_adi/function_app/common/payloads/error.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -from typing import Optional -from pydantic import BaseModel, Field, ConfigDict -from datetime import datetime, timezone - - -class Error(BaseModel): - """Error item model""" - - code: str = Field(..., description="The error code") - message: str = Field(..., description="The error message") - details: Optional[str] = Field( - None, description="Detailed error information from Python" - ) - timestamp: Optional[datetime] = Field( - ..., - description="Creation timestamp in UTC", - default_factory=lambda: datetime.now(timezone.utc), - ) - - __config__ = ConfigDict(extra="ignore") diff --git a/ai_search_with_adi/function_app/common/payloads/header.py b/ai_search_with_adi/function_app/common/payloads/header.py deleted file mode 100644 index c56a39b..0000000 --- a/ai_search_with_adi/function_app/common/payloads/header.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) Microsoft Corporation. 
-# Licensed under the MIT License. - -from pydantic import BaseModel, Field, ConfigDict -from datetime import datetime, timezone -from enum import Enum - - -class DataTypeEnum(Enum): - """Type enum""" - - BUSINESS_GLOSSARY = "business_glossary" - SUMMARY = "summary" - - -class TaskEnum(Enum): - """Task enum""" - - PENDING_INDEX_COMPLETION = "pending_index_completion" - PENDING_INDEX_TRIGGER = "pending_index_trigger" - - -class Header(BaseModel): - """Header model""" - - creation_timestamp: datetime = Field( - ..., - description="Creation timestamp in UTC", - default_factory=lambda: datetime.now(timezone.utc), - ) - last_processed_timestamp: datetime = Field( - ..., - description="Last processed timestamp in UTC", - default_factory=lambda: datetime.now(timezone.utc), - ) - retries_remaining: int = Field( - description="Number of retries remaining", default=10 - ) - data_type: DataTypeEnum = Field(..., description="Data type") - task: TaskEnum = Field(..., description="Task name") - - __config__ = ConfigDict(extra="ignore") diff --git a/ai_search_with_adi/function_app/common/payloads/payload.py b/ai_search_with_adi/function_app/common/payloads/payload.py deleted file mode 100644 index b36f25f..0000000 --- a/ai_search_with_adi/function_app/common/payloads/payload.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -from pydantic import BaseModel, ConfigDict -import logging - - -class Payload(BaseModel): - """Body model""" - - @classmethod - def from_service_bus_message(cls, message): - """ - Create a Payload object from a ServiceBusMessage object. - - :param message: The ServiceBusMessage object. - :return: The Body object. 
- """ - message = message.get_body().decode("utf-8") - logging.info(f"ServiceBus message: {message}") - return cls.model_validate_json(message) - - __config__ = ConfigDict(extra="ignore") diff --git a/ai_search_with_adi/function_app/common/payloads/pending_index_completion.py b/ai_search_with_adi/function_app/common/payloads/pending_index_completion.py deleted file mode 100644 index caf2ade..0000000 --- a/ai_search_with_adi/function_app/common/payloads/pending_index_completion.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -from pydantic import BaseModel, Field, ConfigDict -from datetime import datetime, timezone -from typing import Optional, List - -from common.payloads.header import Header -from common.payloads.error import Error -from common.payloads.payload import Payload - - -class PendingIndexCompletionBody(BaseModel): - """Body model""" - - indexer: str = Field(..., description="The indexer to trigger") - id_field: Optional[int] = Field(None, description="The ID field") - blob_storage_url: Optional[str] = Field( - ..., description="The URL to the blob storage" - ) - id_name: Optional[str] = Field( - None, description="The text name for the integer ID field" - ) - business_unit: Optional[str] = Field(None, description="The business unit") - indexer_start_time: Optional[datetime] = Field( - ..., - description="The time the indexer was triggered successfully", - default_factory=lambda: datetime.now(timezone.utc), - ) - - __config__ = ConfigDict(extra="ignore") - - -class PendingIndexCompletionPayload(Payload): - """Pending Index Trigger model""" - - header: Header = Field(..., description="Header information") - body: PendingIndexCompletionBody = Field(..., description="Body information") - errors: List[Error] | None = Field( - ..., description="List of errors", default_factory=list - ) - - __config__ = ConfigDict(extra="ignore") diff --git 
a/ai_search_with_adi/function_app/common/payloads/pending_index_trigger.py b/ai_search_with_adi/function_app/common/payloads/pending_index_trigger.py deleted file mode 100644 index 199bf36..0000000 --- a/ai_search_with_adi/function_app/common/payloads/pending_index_trigger.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -from pydantic import BaseModel, Field, ConfigDict -from typing import Optional, List - -from common.payloads.header import Header -from common.payloads.error import Error -from common.payloads.payload import Payload - - -class PendingIndexTriggerBody(BaseModel): - """Body model""" - - indexer: str = Field(..., description="The indexer to trigger") - ## this field can be defined based on your id field - id_field: Optional[int] = Field(None, description="The ID field") - blob_storage_url: str = Field(..., description="The URL to the blob storage") - ## this field can be defined based on your id field - id_name: Optional[str] = Field( - None, description="The text name for the integer ID field" - ) - additional_field: Optional[str] = Field( - None, description="Description of additional_field" - ) - - __config__ = ConfigDict(extra="ignore") - - -class PendingIndexTriggerPayload(Payload): - """Pending Index Trigger model""" - - header: Header = Field(..., description="Header information") - body: PendingIndexTriggerBody = Field(..., description="Body information") - errors: List[Error] | None = Field( - ..., description="List of errors", default_factory=list - ) - - __config__ = ConfigDict(extra="ignore") diff --git a/ai_search_with_adi/function_app/common/requirements.txt b/ai_search_with_adi/function_app/common/requirements.txt deleted file mode 100644 index daa8b89..0000000 --- a/ai_search_with_adi/function_app/common/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -azure-storage-blob -azure-servicebus -azure-core -azure-identity -pydantic -pymongo -azure-search 
-azure-search-documents==11.6.0b4 -openai -aiohttp -motor diff --git a/ai_search_with_adi/function_app/common/service_bus.py b/ai_search_with_adi/function_app/common/service_bus.py deleted file mode 100644 index 9e95fe8..0000000 --- a/ai_search_with_adi/function_app/common/service_bus.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import os -import logging -from datetime import datetime, timezone -from azure.identity.aio import DefaultAzureCredential -from azure.servicebus import ServiceBusMessage -from azure.servicebus.aio import ServiceBusClient - - -class ServiceBusHelper: - def __init__(self): - self._client_id = os.environ["FunctionApp__ClientId"] - - self._endpoint = os.environ["ServiceBusTrigger__fullyQualifiedNamespace"] - - async def get_client(self): - credential = DefaultAzureCredential(managed_identity_client_id=self._client_id) - return ServiceBusClient(self._endpoint, credential) - - async def send_message_to_service_bus_queue( - self, queue, payload, enqueue_time=None, retry=False - ): - # update the header - payload.header.last_processed_timestamp = datetime.now(timezone.utc) - payload.header.task = queue - - if retry: - payload.header.retries_remaining -= 1 - try: - service_bus_client = await self.get_client() - async with service_bus_client: - sender = service_bus_client.get_queue_sender(queue_name=queue.value) - - async with sender: - message = ServiceBusMessage( - body=payload.model_dump_json(), - scheduled_enqueue_time_utc=enqueue_time, - ) - await sender.send_messages(message) - logging.info( - f"Sent a message to the Azure Service Bus queue: {queue}" - ) - except Exception as e: - logging.error(f"Failed to send message to the Azure Service Bus queue: {e}") diff --git a/ai_search_with_adi/function_app/function_app.py b/ai_search_with_adi/function_app/function_app.py index eacb1e3..10278e6 100644 --- a/ai_search_with_adi/function_app/function_app.py +++ 
b/ai_search_with_adi/function_app/function_app.py @@ -1,100 +1,21 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. - -from datetime import datetime, timedelta, timezone import azure.functions as func import logging import json import asyncio from adi_2_ai_search import process_adi_2_ai_search -from common.service_bus import ServiceBusHelper from pre_embedding_cleaner import process_pre_embedding_cleaner -from text_split import process_text_split -from ai_search_2_compass import process_ai_search_2_compass -from key_phrase_extraction import process_key_phrase_extraction -from ocr import process_ocr -from pending_index_completion import process_pending_index_completion -from pending_index_trigger import process_pending_index_trigger -from common.payloads.pending_index_trigger import PendingIndexTriggerPayload +from key_phrase_extraction import process_key_phrase_extraction -from common.payloads.header import TaskEnum logging.basicConfig(level=logging.INFO) app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION) -@app.route(route="text_split", methods=[func.HttpMethod.POST]) -async def text_split(req: func.HttpRequest) -> func.HttpResponse: - """Extract the content from a document using ADI.""" - - try: - req_body = req.get_json() - values = req_body.get("values") - text_split_config = req.headers - except ValueError: - return func.HttpResponse( - "Please valid Custom Skill Payload in the request body", status_code=400 - ) - else: - logging.debug(f"Input Values: {values}") - - record_tasks = [] - - for value in values: - record_tasks.append( - asyncio.create_task(process_text_split(value, text_split_config)) - ) - - results = await asyncio.gather(*record_tasks) - logging.debug(f"Results: {results}") - - return func.HttpResponse( - json.dumps({"values": results}), - status_code=200, - mimetype="application/json", - ) - - -@app.route(route="ai_search_2_compass", methods=[func.HttpMethod.POST]) -async def ai_search_2_compass(req: 
func.HttpRequest) -> func.HttpResponse: - logging.info("Python HTTP trigger function processed a request.") - - """HTTP trigger for AI Search 2 Compass function. - - Args: - req (func.HttpRequest): The HTTP request object. - - Returns: - func.HttpResponse: The HTTP response object.""" - logging.info("Python HTTP trigger function processed a request.") - - try: - req_body = req.get_json() - values = req_body.get("values") - except ValueError: - return func.HttpResponse( - "Please valid Custom Skill Payload in the request body", status_code=400 - ) - else: - logging.debug("Input Values: %s", values) - - record_tasks = [] - - for value in values: - record_tasks.append(asyncio.create_task(process_ai_search_2_compass(value))) - - results = await asyncio.gather(*record_tasks) - logging.debug("Results: %s", results) - vectorised_tasks = {"values": results} - - return func.HttpResponse( - json.dumps(vectorised_tasks), status_code=200, mimetype="application/json" - ) - - @app.route(route="adi_2_ai_search", methods=[func.HttpMethod.POST]) async def adi_2_ai_search(req: func.HttpRequest) -> func.HttpResponse: """Extract the content from a document using ADI.""" @@ -205,95 +126,3 @@ async def key_phrase_extractor(req: func.HttpRequest) -> func.HttpResponse: return func.HttpResponse( json.dumps(cleaned_tasks), status_code=200, mimetype="application/json" ) - - -@app.route(route="ocr", methods=[func.HttpMethod.POST]) -async def ocr(req: func.HttpRequest) -> func.HttpResponse: - """HTTP trigger for data cleanup function. - - Args: - req (func.HttpRequest): The HTTP request object. 
- - Returns: - func.HttpResponse: The HTTP response object.""" - logging.info("Python HTTP trigger data cleanup function processed a request.") - - try: - req_body = req.get_json() - values = req_body.get("values") - except ValueError: - return func.HttpResponse( - "Please valid Custom Skill Payload in the request body", status_code=400 - ) - else: - logging.debug("Input Values: %s", values) - - record_tasks = [] - - for value in values: - record_tasks.append(asyncio.create_task(process_ocr(value))) - - results = await asyncio.gather(*record_tasks) - logging.debug("Results: %s", results) - cleaned_tasks = {"values": results} - - return func.HttpResponse( - json.dumps(cleaned_tasks), status_code=200, mimetype="application/json" - ) - - -@app.service_bus_queue_trigger( - arg_name="msg", - queue_name="pending_index_trigger", - connection="ServiceBusTrigger", -) -async def pending_index_trigger(msg: func.ServiceBusMessage): - logging.info( - f"trigger-indexer: Python ServiceBus queue trigger processed message: {msg}" - ) - try: - payload = PendingIndexTriggerPayload.from_service_bus_message(msg) - await process_pending_index_trigger(payload) - except ValueError as ve: - logging.error(f"ValueError: {ve}") - except Exception as e: - logging.error(f"Error processing ServiceBus message: {e}") - - if "On-demand indexer invocation is permitted every 180 seconds" in str(e): - logging.warning( - f"Indexer invocation limit reached: {e}. Scheduling a retry." 
- ) - service_bus_helper = ServiceBusHelper() - message = PendingIndexTriggerPayload( - header=payload.header, body=payload.body, errors=[] - ) - queue = TaskEnum.PENDING_INDEX_TRIGGER.value - minutes = 2 ** (11 - payload.header.retries_remaining) - enqueue_time = datetime.now(timezone.utc) + timedelta(minutes=minutes) - await service_bus_helper.send_message_to_service_bus_queue( - queue, message, enqueue_time=enqueue_time - ) - else: - raise e - - -@app.service_bus_queue_trigger( - arg_name="msg", - queue_name="pending_index_completion", - connection="ServiceBusTrigger", -) -async def pending_index_completion(msg: func.ServiceBusMessage): - logging.info( - f"indexer-polling-trigger: Python ServiceBus queue trigger processed message: {msg}" - ) - - try: - payload = PendingIndexTriggerPayload.from_service_bus_message(msg) - await process_pending_index_completion(payload) - except ValueError as ve: - logging.error(f"ValueError: {ve}") - except Exception as e: - logging.error(f"Error processing ServiceBus message: {e}") - if "The operation has timed out" in str(e): - logging.error("The operation has timed out.") - raise e diff --git a/ai_search_with_adi/function_app/ocr.py b/ai_search_with_adi/function_app/ocr.py deleted file mode 100644 index e179eb1..0000000 --- a/ai_search_with_adi/function_app/ocr.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import logging -import os -from azure.ai.vision.imageanalysis.aio import ImageAnalysisClient -from azure.ai.vision.imageanalysis.models import VisualFeatures -from azure.core.credentials import AzureKeyCredential - - -async def process_ocr(record: dict) -> dict: - logging.info("Python HTTP trigger function processed a request.") - - try: - url = record["data"]["image"]["url"] - logging.info(f"Request Body: {record}") - except KeyError: - return { - "recordId": record["recordId"], - "data": {}, - "errors": [ - { - "message": "Failed to extract data with ocr. 
Pass a valid source in the request body.", - } - ], - "warnings": None, - } - else: - logging.info(f"image url: {url}") - - if url is not None: - try: - client = ImageAnalysisClient( - endpoint=os.environ["AIService__Services__Endpoint"], - credential=AzureKeyCredential( - os.environ["AIService__Services__Key"] - ), - ) - result = await client.analyze_from_url( - image_url=url, visual_features=[VisualFeatures.READ] - ) - logging.info("logging output") - - # Extract text from OCR results - text = " ".join([line.text for line in result.read.blocks[0].lines]) - logging.info(text) - - except KeyError as e: - logging.error(e) - logging.error(f"Failed to authenticate with ocr: {e}") - return { - "recordId": record["recordId"], - "data": {}, - "errors": [ - { - "message": f"Failed to authenticate with Ocr. Check the service credentials exist. {e}", - } - ], - "warnings": None, - } - except Exception as e: - logging.error(e) - logging.error( - f"Failed to analyze the document with Azure Document Intelligence: {e}" - ) - logging.error(e.InnerError) - return { - "recordId": record["recordId"], - "data": {}, - "errors": [ - { - "message": f"Failed to analyze the document with ocr. Check the source and try again. {e}", - } - ], - "warnings": None, - } - else: - return { - "recordId": record["recordId"], - "data": {"text": ""}, - } - - return { - "recordId": record["recordId"], - "data": {"text": text}, - } diff --git a/ai_search_with_adi/function_app/pending_index_completion.py b/ai_search_with_adi/function_app/pending_index_completion.py deleted file mode 100644 index 4c4d1ba..0000000 --- a/ai_search_with_adi/function_app/pending_index_completion.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- -from common.ai_search import AISearchHelper, IndexerStatusEnum -from common.service_bus import ServiceBusHelper -from common.payloads.pending_index_completion import PendingIndexCompletionPayload -from common.payloads.pending_index_trigger import PendingIndexTriggerPayload -from common.payloads.header import TaskEnum -from common.payloads.error import Error -from datetime import datetime, timedelta, timezone -from common.delay_processing_exception import DelayProcessingException -import asyncio - - -async def process_pending_index_completion(payload: PendingIndexCompletionPayload): - """Process the pending index completion.""" - ai_search_helper = AISearchHelper() - service_bus_helper = ServiceBusHelper() - - status, indexer_start_time = await ai_search_helper.get_indexer_status( - payload.body.indexer - ) - request_time = payload.header.creation_timestamp - enqueue_time = None - queue = None - messages = [] - retry = False - - if status == IndexerStatusEnum.RETRIGGER and payload.header.retries_remaining > 0: - # Trigger the indexer - await ai_search_helper.trigger_indexer(payload.body.indexer) - - errors = [error_item.model_dump() for error_item in payload.errors] - errors.append( - Error( - code="IndexerNotCompleted", - message="Indexer was was in failed state and required retriggering.", - ) - ) - messages.append( - PendingIndexCompletionPayload( - header=payload.header.model_dump(), - body=payload.body.model_dump(), - errors=errors, - ) - ) - queue = TaskEnum.PENDING_INDEX_COMPLETION - minutes = 2 ** (11 - payload.header.retries_remaining) - enqueue_time = datetime.now(timezone.utc) + timedelta(minutes=minutes) - retry = True - elif status == IndexerStatusEnum.RUNNING and payload.header.retries_remaining > 0: - errors = [error_item.model_dump() for error_item in payload.errors] - errors.append( - Error( - code="IndexerNotCompleted", - message="Indexer was completed not at the time of running.", - ) - ) - messages.append( - PendingIndexCompletionPayload( - 
header=payload.header.model_dump(), - body=payload.body.model_dump(), - errors=errors, - ) - ) - queue = TaskEnum.PENDING_INDEX_COMPLETION - minutes = 2 ** (11 - payload.header.retries_remaining) - enqueue_time = datetime.now(timezone.utc) + timedelta(minutes=minutes) - retry = True - elif ( - status == IndexerStatusEnum.SUCCESS - and indexer_start_time <= request_time - and payload.header.retries_remaining > 0 - ): - errors = [error_item.model_dump() for error_item in payload.errors] - errors.append( - Error( - code="IndexerNotTriggered", - message="Indexer was not triggered.", - ) - ) - messages.append( - PendingIndexTriggerPayload( - header=payload.header.model_dump(), - body=payload.body.model_dump(), - errors=errors, - ) - ) - queue = TaskEnum.PENDING_INDEX_TRIGGER - minutes = 2 ** (11 - payload.header.retries_remaining) - enqueue_time = datetime.now(timezone.utc) + timedelta(minutes=minutes) - retry = True - else: - raise DelayProcessingException( - "Failed to run trigger due to maximum retries exceeded." - ) - - if queue is not None and len(messages) > 0: - message_tasks = [] - for message in messages: - message_tasks.append( - service_bus_helper.send_message_to_service_bus_queue( - queue, message, enqueue_time=enqueue_time, retry=retry - ) - ) - - await asyncio.gather(*message_tasks) diff --git a/ai_search_with_adi/function_app/pending_index_trigger.py b/ai_search_with_adi/function_app/pending_index_trigger.py deleted file mode 100644 index f803623..0000000 --- a/ai_search_with_adi/function_app/pending_index_trigger.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- -from common.ai_search import AISearchHelper, IndexerStatusEnum -from common.service_bus import ServiceBusHelper -from common.payloads.pending_index_trigger import PendingIndexTriggerPayload -from common.payloads.pending_index_completion import PendingIndexCompletionPayload -from common.payloads.header import TaskEnum -from datetime import datetime, timedelta, timezone -from common.delay_processing_exception import DelayProcessingException -from common.payloads.error import Error - - -async def process_pending_index_trigger(payload: PendingIndexTriggerPayload): - """Process the pending index trigger.""" - - ai_search_helper = AISearchHelper() - service_bus_helper = ServiceBusHelper() - - status, indexer_start_time = await ai_search_helper.get_indexer_status( - payload.body.indexer - ) - request_time = payload.header.last_processed_timestamp - enqueue_time = None - queue = None - message = None - retry = False - - if status == IndexerStatusEnum.SUCCESS and indexer_start_time > request_time: - errors = [error_item.model_dump() for error_item in payload.errors] - message = PendingIndexCompletionPayload( - header=payload.header.model_dump(), - body=payload.body.model_dump(), - errors=errors, - ) - queue = TaskEnum.PENDING_INDEX_COMPLETION - elif status == IndexerStatusEnum.RETRIGGER or status == IndexerStatusEnum.SUCCESS: - # Trigger the indexer - await ai_search_helper.trigger_indexer(payload.body.indexer) - - errors = [error_item.model_dump() for error_item in payload.errors] - - if status == IndexerStatusEnum.RETRIGGER: - errors.append( - Error( - code="IndexerNotCompleted", - message="Indexer was was in failed state and required retriggering.", - ) - ) - - message = PendingIndexCompletionPayload( - header=payload.header.model_dump(), - body=payload.body.model_dump(), - errors=errors, - ) - queue = TaskEnum.PENDING_INDEX_COMPLETION - elif status == IndexerStatusEnum.RUNNING and indexer_start_time > request_time: - errors = [error_item.model_dump() for error_item 
in payload.errors] - message = PendingIndexCompletionPayload( - header=payload.header.model_dump(), - body=payload.body.model_dump(), - errors=errors, - ) - queue = TaskEnum.PENDING_INDEX_COMPLETION - elif ( - status == IndexerStatusEnum.RUNNING - and indexer_start_time <= request_time - and payload.header.retries_remaining > 0 - ): - errors = [error_item.model_dump() for error_item in payload.errors] - errors.append( - Error( - code="IndexerAlreadyRunning", - message="Indexer is already running for an outstanding request.", - ) - ) - message = PendingIndexTriggerPayload( - header=payload.header.model_dump(), - body=payload.body.model_dump(), - errors=errors, - ) - queue = TaskEnum.PENDING_INDEX_TRIGGER - minutes = 2 ** (11 - payload.header.retries_remaining) - enqueue_time = datetime.now(timezone.utc) + timedelta(minutes=minutes) - retry = True - else: - raise DelayProcessingException( - "Failed to run trigger due to maximum retries exceeded." - ) - - if queue is not None: - await service_bus_helper.send_message_to_service_bus_queue( - queue, message, enqueue_time=enqueue_time, retry=retry - ) diff --git a/ai_search_with_adi/function_app/text_split.py b/ai_search_with_adi/function_app/text_split.py deleted file mode 100644 index 8121c70..0000000 --- a/ai_search_with_adi/function_app/text_split.py +++ /dev/null @@ -1,355 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- -import spacy -import logging -from transformers import AutoTokenizer, AutoModelForSeq2SeqLM -import json -from sklearn.metrics.pairwise import cosine_similarity - -nlp = spacy.load("en_core_web_md") - - -class RecursiveCharacterTextSplitter: - def __init__(self, fragment_size=100, division_chars=["\n\n", "\n", " ", ""]): - self.fragment_size = fragment_size - self.division_chars = division_chars - - def split_text(self, text): - return self._recursive_split(text, 0) - - def _recursive_split(self, text, char_idx): - if len(text) <= self.fragment_size or char_idx >= len(self.division_chars): - return [text] - - char = self.division_chars[char_idx] - fragments = text.split(char) - result = [] - current_fragment = "" - - for fragment in fragments: - if len(current_fragment) + len(fragment) + len(char) <= self.fragment_size: - current_fragment += char + fragment - else: - if current_fragment: - result.append(current_fragment) - current_fragment = fragment - - if current_fragment: - result.append(current_fragment) - - if any(len(frag) > self.fragment_size for frag in result): - return self._recursive_split(text, char_idx + 1) - - return result - - -class CharacterTextSplitter: - def __init__(self, fragment_size=100, separator=" "): - self.fragment_size = fragment_size - self.separator = separator - - def split_text(self, text): - fragments = text.split(self.separator) - result = [] - current_fragment = "" - - for fragment in fragments: - if ( - len(current_fragment) + len(fragment) + len(self.separator) - <= self.fragment_size - ): - current_fragment += self.separator + fragment - else: - if current_fragment: - result.append(current_fragment) - current_fragment = fragment - - if current_fragment: - result.append(current_fragment) - - return result - - -class RecursiveTextSplitter: - def __init__(self, fragment_size=100, division_tokens=["\n\n", "\n", " ", ""]): - self.fragment_size = fragment_size - self.division_tokens = division_tokens - - def split_text(self, 
text): - return self._recursive_split(text, 0) - - def _recursive_split(self, text, token_idx): - if len(text) <= self.fragment_size or token_idx >= len(self.division_tokens): - return [text] - - token = self.division_tokens[token_idx] - fragments = text.split(token) - result = [] - current_fragment = "" - - for fragment in fragments: - if len(current_fragment) + len(fragment) + len(token) <= self.fragment_size: - current_fragment += token + fragment - else: - if current_fragment: - result.append(current_fragment) - current_fragment = fragment - - if current_fragment: - result.append(current_fragment) - - if any(len(frag) > self.fragment_size for frag in result): - return self._recursive_split(text, token_idx + 1) - - return result - - -class SemanticDoubleMergingSplitterNodeParser: - def __init__( - self, - initial_threshold=0.8, - appending_threshold=0.7, - merging_threshold=0.75, - fragment_size=100, - spacy_model="en_core_web_md", - ): - self.initial_threshold = initial_threshold - self.appending_threshold = appending_threshold - self.merging_threshold = merging_threshold - self.fragment_size = fragment_size - try: - self.nlp = spacy.load(spacy_model) - except IOError: - raise ValueError( - f"Spacy model '{spacy_model}' not found. 
Please download it using 'python -m spacy download {spacy_model}'" - ) - - def split_text(self, text): - sentences = self._split_into_sentences(text) - initial_chunks = self._initial_pass(sentences) - final_chunks = self._second_pass(initial_chunks) - return final_chunks - - def _split_into_sentences(self, text): - doc = self.nlp(text) - sentences = [sent.text for sent in doc.sents] - return sentences - - def _initial_pass(self, sentences): - chunks = [] - current_chunk = [] - - i = 0 - while i < len(sentences): - current_chunk.append(sentences[i]) - if len(current_chunk) >= 2: - cosine_sim = self._cosine_similarity( - " ".join(current_chunk[-2:]), sentences[i] - ) - if ( - cosine_sim < self.initial_threshold - or len(" ".join(current_chunk)) > self.fragment_size - ): - if len(current_chunk) > 2: - chunks.append(" ".join(current_chunk[:-1])) - current_chunk = [current_chunk[-1]] - else: - chunks.append(current_chunk[0]) - current_chunk = [current_chunk[1]] - i += 1 - - if current_chunk: - chunks.append(" ".join(current_chunk)) - - return chunks - - def _second_pass(self, chunks): - merged_chunks = [] - current_chunk = chunks[0] - - i = 1 - while i < len(chunks): - cosine_sim = self._cosine_similarity(current_chunk, chunks[i]) - if ( - cosine_sim >= self.merging_threshold - and len(current_chunk + " " + chunks[i]) <= self.fragment_size - ): - current_chunk += " " + chunks[i] - else: - merged_chunks.append(current_chunk) - current_chunk = chunks[i] - i += 1 - - merged_chunks.append(current_chunk) - return merged_chunks - - def _cosine_similarity(self, text1, text2): - vec1 = self.nlp(text1).vector - vec2 = self.nlp(text2).vector - return cosine_similarity([vec1], [vec2])[0, 0] - - -class FlanT5Chunker: - def __init__( - self, model_name="chentong00/propositionizer-wiki-flan-t5-large", device="cpu" - ): - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device) - self.device = device - 
self.max_length = 512 # Model's maximum token length - - def flan_t5_chunking(self, text, chunk_size=500, stride=20): - input_text = f"Title: . Section: . Content: {text}" - input_ids = self.tokenizer(input_text, return_tensors="pt").input_ids.to( - self.device - ) - total_length = input_ids.shape[1] - - chunks = [] - for i in range(0, total_length, chunk_size - stride): - end = min(i + chunk_size, total_length) - chunk_input_ids = input_ids[:, i:end] - outputs = self.model.generate( - chunk_input_ids, max_new_tokens=self.max_length - ).cpu() - output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True) - try: - prop_list = json.loads(output_text) - except json.JSONDecodeError: - prop_list = [] - print("[ERROR] Failed to parse output text as JSON.") - chunks.append(prop_list) - - # Flatten the list of lists - return [item for sublist in chunks for item in sublist] - - -def clean_input(value): - """Clean the input value. - - Args: - value: The input value. - - Returns: - The cleaned value.""" - if isinstance(value, str): - return value.strip('"') - return value - - -async def process_text_split(record: dict, text_split_config: dict) -> dict: - """Process the text split request. - - Args: - record (dict): The request record. - text_split_config (dict): The headers for config. - - Returns: - dict: The response record. - """ - try: - data = record["data"] - text = clean_input(data.get("text")) - logging.info(f"Request Body: {record}") - except KeyError: - return { - "recordId": record["recordId"], - "data": {}, - "errors": [ - { - "message": "Failed to split text. Pass valid parameters.", - } - ], - "warnings": None, - } - else: - if text is None: - logging.error("Failed to split text. Pass valid text.") - return { - "recordId": record["recordId"], - "data": {}, - "errors": [ - { - "message": "Failed to split text. 
Pass valid text.", - } - ], - "warnings": None, - } - - splitter_type = clean_input( - text_split_config.get("text_split_mode", "recursive_character") - ) - fragment_size = float( - clean_input(text_split_config.get("maximum_page_length", 100)) - ) - separator = clean_input(text_split_config.get("separator", " ")) - initial_threshold = float( - clean_input(text_split_config.get("initial_threshold", 0.8)) - ) - appending_threshold = float( - clean_input(text_split_config.get("appending_threshold", 0.7)) - ) - merging_threshold = float( - clean_input(text_split_config.get("merging_threshold", 0.75)) - ) - - try: - if splitter_type == "recursive_character": - splitter = RecursiveCharacterTextSplitter(fragment_size=fragment_size) - elif splitter_type == "character": - splitter = CharacterTextSplitter( - fragment_size=fragment_size, separator=separator - ) - elif splitter_type == "recursive": - splitter = RecursiveTextSplitter(fragment_size=fragment_size) - elif splitter_type == "semantic": - splitter = SemanticDoubleMergingSplitterNodeParser( - initial_threshold=initial_threshold, - appending_threshold=appending_threshold, - merging_threshold=merging_threshold, - fragment_size=fragment_size, - ) - elif splitter_type == "flan_t5": - splitter = FlanT5Chunker() - else: - logging.error("Failed to split text. Pass valid splitter type.") - logging.error(f"Splitter Type: {splitter_type}") - return { - "recordId": record["recordId"], - "data": {}, - "errors": [ - { - "message": "Failed to split text. Pass valid splitter type.", - } - ], - "warnings": None, - } - - if splitter_type == "flan_t5": - chunks = splitter.flan_t5_chunking(text) - else: - chunks = splitter.split_text(text) - except Exception as e: - logging.error(f"Error during splitting: {e}") - - return { - "recordId": record["recordId"], - "data": {}, - "errors": [ - { - "message": f"Failed to split text. Check function app logs for more details of exact failure. 
{str(e)}", - } - ], - "warnings": None, - } - - else: - return { - "recordId": record["recordId"], - "data": { - "chunks": chunks, - }, - "errors": None, - "warnings": None, - } From 7f27d9359db39feff7ef74f132ef024c9e6773e2 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 17:09:36 +0100 Subject: [PATCH 16/33] Update the function app code --- .vscode/extensions.json | 6 + .vscode/launch.json | 15 ++ .vscode/settings.json | 7 + .vscode/tasks.json | 15 ++ ai_search_with_adi/ai_search/.env | 2 +- ai_search_with_adi/ai_search/ai_search.py | 2 +- ai_search_with_adi/ai_search/environment.py | 2 +- ai_search_with_adi/ai_search/requirements.txt | 5 + ai_search_with_adi/function_app/.env | 9 ++ ai_search_with_adi/function_app/.funcignore | 8 + ai_search_with_adi/function_app/README.md | 0 .../function_app/adi_2_ai_search.py | 54 +++++-- .../function_app/environment.py | 30 ++++ .../function_app/function_app.py | 8 +- ai_search_with_adi/function_app/host.json | 16 ++ .../function_app/key_phrase_extraction.py | 153 +++++++++--------- .../function_app/requirements.txt | 1 + .../function_app/storage_account.py | 37 +++-- 18 files changed, 257 insertions(+), 113 deletions(-) create mode 100644 .vscode/extensions.json create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 .vscode/tasks.json create mode 100644 ai_search_with_adi/ai_search/requirements.txt create mode 100644 ai_search_with_adi/function_app/.env create mode 100644 ai_search_with_adi/function_app/.funcignore create mode 100644 ai_search_with_adi/function_app/README.md create mode 100644 ai_search_with_adi/function_app/environment.py create mode 100644 ai_search_with_adi/function_app/host.json diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..cbbad0f --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,6 @@ +{ + "recommendations": [ + "ms-azuretools.vscode-azurefunctions", + "ms-python.python" + ] +} diff 
--git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..7ff8568 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + "configurations": [ + { + "connect": { + "host": "localhost", + "port": 9091 + }, + "name": "Attach to Python Functions", + "preLaunchTask": "func: host start", + "request": "attach", + "type": "debugpy" + } + ], + "version": "0.2.0" +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..4d62d59 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "azureFunctions.projectLanguage": "Python", + "azureFunctions.projectLanguageModel": 2, + "azureFunctions.projectRuntime": "~4", + "azureFunctions.scmDoBuildDuringDeployment": true, + "debug.internalConsoleOptions": "neverOpen" +} diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..f9a5026 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,15 @@ +{ + "tasks": [ + { + "command": "host start", + "isBackground": true, + "label": "func: host start", + "options": { + "cwd": "${workspaceFolder}/ai_search_with_adi/function_app" + }, + "problemMatcher": "$func-python-watch", + "type": "func" + } + ], + "version": "2.0.0" +} diff --git a/ai_search_with_adi/ai_search/.env b/ai_search_with_adi/ai_search/.env index f7efa45..faeeed4 100644 --- a/ai_search_with_adi/ai_search/.env +++ b/ai_search_with_adi/ai_search/.env @@ -4,7 +4,7 @@ FunctionApp__PreEmbeddingCleaner__FunctionName=pre_embedding_cleaner FunctionApp__ADI__FunctionName=adi_2_ai_search FunctionApp__KeyPhraseExtractor__FunctionName=key_phrase_extractor FunctionApp__AppRegistrationResourceId= -AIService__AzureSearchOptions__IdentityType= # system_assigned or user_assigned or key +IdentityType= # system_assigned or user_assigned or key AIService__AzureSearchOptions__Endpoint= AIService__AzureSearchOptions__Identity__ClientId= AIService__AzureSearchOptions__Key= diff --git a/ai_search_with_adi/ai_search/ai_search.py 
b/ai_search_with_adi/ai_search/ai_search.py index f392400..ee1fa99 100644 --- a/ai_search_with_adi/ai_search/ai_search.py +++ b/ai_search_with_adi/ai_search/ai_search.py @@ -374,7 +374,7 @@ def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill: InputFieldMappingEntry(name="text", source=source), ] key_phrase_extraction__skill_outputs = [ - OutputFieldMappingEntry(name="keyPhrases", target_name="keywords") + OutputFieldMappingEntry(name="key_phrases", target_name="keywords") ] key_phrase_extraction_skill = WebApiSkill( name="Key phrase extraction API", diff --git a/ai_search_with_adi/ai_search/environment.py b/ai_search_with_adi/ai_search/environment.py index 9729387..035facf 100644 --- a/ai_search_with_adi/ai_search/environment.py +++ b/ai_search_with_adi/ai_search/environment.py @@ -54,7 +54,7 @@ def identity_type(self) -> IdentityType: Returns: IdentityType: The identity type """ - identity = os.environ.get("AIService__AzureSearchOptions__IdentityType") + identity = os.environ.get("IdentityType") if identity == "user_assigned": return IdentityType.USER_ASSIGNED diff --git a/ai_search_with_adi/ai_search/requirements.txt b/ai_search_with_adi/ai_search/requirements.txt new file mode 100644 index 0000000..4a3e38d --- /dev/null +++ b/ai_search_with_adi/ai_search/requirements.txt @@ -0,0 +1,5 @@ +python-dotenv +azure-search-documents==11.6.0b4 +azure-storage-blob +azure-identity +azure-mgmt-web diff --git a/ai_search_with_adi/function_app/.env b/ai_search_with_adi/function_app/.env new file mode 100644 index 0000000..fdf51e9 --- /dev/null +++ b/ai_search_with_adi/function_app/.env @@ -0,0 +1,9 @@ +AIService__Services__Endpoint= +AIService__Services__Key= +IdentityType= # system_assigned or user_assigned or key +StorageAccount__ConnectionString= +OpenAI__ApiKey= +OpenAI__Endpoint= +OpenAI__MultiModalDeployment= +OpenAI__ApiVersion= +FunctionApp__ClientId= diff --git a/ai_search_with_adi/function_app/.funcignore 
b/ai_search_with_adi/function_app/.funcignore new file mode 100644 index 0000000..f1110d3 --- /dev/null +++ b/ai_search_with_adi/function_app/.funcignore @@ -0,0 +1,8 @@ +.git* +.vscode +__azurite_db*__.json +__blobstorage__ +__queuestorage__ +local.settings.json +test +.venv diff --git a/ai_search_with_adi/function_app/README.md b/ai_search_with_adi/function_app/README.md new file mode 100644 index 0000000..e69de29 diff --git a/ai_search_with_adi/function_app/adi_2_ai_search.py b/ai_search_with_adi/function_app/adi_2_ai_search.py index 0bf695e..bb41f25 100644 --- a/ai_search_with_adi/function_app/adi_2_ai_search.py +++ b/ai_search_with_adi/function_app/adi_2_ai_search.py @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. - +from azure.identity import DefaultAzureCredential, get_bearer_token_provider import base64 from azure.core.credentials import AzureKeyCredential from azure.ai.documentintelligence.aio import DocumentIntelligenceClient @@ -17,6 +17,7 @@ import json from openai import AsyncAzureOpenAI import openai +from environment import IdentityType, get_identity_type def crop_image_from_pdf_page(pdf_path, page_number, bounding_box): @@ -134,23 +135,42 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): """ MAX_TOKENS = 2000 - api_key = os.environ["AzureAI_GPT4V_Key"] - api_version = os.environ["AzureAI__GPT4V_Version"] - deployment_name = os.environ["AzureAI__GPT4V_Deployment"] - api_base = os.environ["AzureAI__GPT4V_APIbase"] + api_version = os.environ.get("OpenAI__ApiVersion") + model = os.environ.get("OpenAI__MultiModalDeployment") + + if get_identity_type() != IdentityType.SYSTEM_ASSIGNED: + token_provider = get_bearer_token_provider( + DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default" + ) + api_key = None + elif get_identity_type() != IdentityType.USER_ASSIGNED: + token_provider = get_bearer_token_provider( + DefaultAzureCredential( + 
managed_identity_client_id=os.environ["FunctionApp__ClientId"] + ), + "https://cognitiveservices.azure.com/.default", + ) + api_key = None + else: + token_provider = None + api_key = os.environ.get("OpenAI__ApiKey") try: async with AsyncAzureOpenAI( api_key=api_key, api_version=api_version, - base_url=f"{api_base}/openai/deployments/{deployment_name}", + azure_ad_token_provider=token_provider, + azure_endpoint=os.environ.get("OpenAI__AzureEndpoint"), ) as client: # We send both image caption and the image body to GPTv for better understanding if caption != "": response = await client.chat.completions.create( - model=deployment_name, + model=model, messages=[ - {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "system", + "content": "You are an expert in image analysis. Use your experience and skills to provided a detailed description of any provided images. You should focus on what info can be inferred from the image and the meaning of the data inside the image.", + }, { "role": "user", "content": [ @@ -170,9 +190,12 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): else: response = await client.chat.completions.create( - model=deployment_name, + model=model, messages=[ - {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "system", + "content": "You are an expert in image analysis. Use your experience and skills to provided a detailed description of any provided images. 
You should focus on what info can be inferred from the image and the meaning of the data inside the image.", + }, { "role": "user", "content": [ @@ -323,9 +346,18 @@ async def analyse_document(file_path: str) -> AnalyzeResult: with open(file_path, "rb") as f: file_read = f.read() + if get_identity_type() == IdentityType.SYSTEM_ASSIGNED: + credential = DefaultAzureCredential() + elif get_identity_type() == IdentityType.USER_ASSIGNED: + credential = DefaultAzureCredential( + managed_identity_client_id=os.environ["FunctionApp__ClientId"] + ) + else: + credential = AzureKeyCredential(os.environ["AIService__Services__Key"]) + async with DocumentIntelligenceClient( endpoint=os.environ["AIService__Services__Endpoint"], - credential=AzureKeyCredential(os.environ["AIService__Services__Key"]), + credential=credential, ) as document_intelligence_client: poller = await document_intelligence_client.begin_analyze_document( model_id="prebuilt-layout", diff --git a/ai_search_with_adi/function_app/environment.py b/ai_search_with_adi/function_app/environment.py new file mode 100644 index 0000000..232254e --- /dev/null +++ b/ai_search_with_adi/function_app/environment.py @@ -0,0 +1,30 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +import os +from enum import Enum + + +class IdentityType(Enum): + """The type of the indexer""" + + USER_ASSIGNED = "user_assigned" + SYSTEM_ASSIGNED = "system_assigned" + KEY = "key" + + +def get_identity_type() -> IdentityType: + """This function returns the identity type. 
+ + Returns: + IdentityType: The identity type + """ + identity = os.environ.get("IdentityType") + + if identity == "user_assigned": + return IdentityType.USER_ASSIGNED + elif identity == "system_assigned": + return IdentityType.SYSTEM_ASSIGNED + elif identity == "key": + return IdentityType.KEY + else: + raise ValueError("Invalid identity type") diff --git a/ai_search_with_adi/function_app/function_app.py b/ai_search_with_adi/function_app/function_app.py index 10278e6..52cc66e 100644 --- a/ai_search_with_adi/function_app/function_app.py +++ b/ai_search_with_adi/function_app/function_app.py @@ -7,11 +7,8 @@ from adi_2_ai_search import process_adi_2_ai_search from pre_embedding_cleaner import process_pre_embedding_cleaner - - from key_phrase_extraction import process_key_phrase_extraction - logging.basicConfig(level=logging.INFO) app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION) @@ -121,8 +118,9 @@ async def key_phrase_extractor(req: func.HttpRequest) -> func.HttpResponse: results = await asyncio.gather(*record_tasks) logging.debug("Results: %s", results) - cleaned_tasks = {"values": results} return func.HttpResponse( - json.dumps(cleaned_tasks), status_code=200, mimetype="application/json" + json.dumps({"values": results}), + status_code=200, + mimetype="application/json", ) diff --git a/ai_search_with_adi/function_app/host.json b/ai_search_with_adi/function_app/host.json new file mode 100644 index 0000000..20e5f3c --- /dev/null +++ b/ai_search_with_adi/function_app/host.json @@ -0,0 +1,16 @@ +{ + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + }, + "functionTimeout": "00:05:00", + "logging": { + "applicationInsights": { + "samplingSettings": { + "excludedTypes": "Request", + "isEnabled": true + } + } + }, + "version": "2.0" +} diff --git a/ai_search_with_adi/function_app/key_phrase_extraction.py b/ai_search_with_adi/function_app/key_phrase_extraction.py index 1de62bc..1b699e2 100644 --- 
a/ai_search_with_adi/function_app/key_phrase_extraction.py +++ b/ai_search_with_adi/function_app/key_phrase_extraction.py @@ -8,64 +8,77 @@ from azure.core.exceptions import HttpResponseError from azure.core.credentials import AzureKeyCredential import asyncio +from azure.identity import DefaultAzureCredential +from environment import IdentityType, get_identity_type MAX_TEXT_ELEMENTS = 5120 -def split_document(document, max_size): - """Split a document into chunks of max_size.""" +def split_document(document: str, max_size: int) -> list[str]: + """Split a document into chunks of max_size. + + Args: + document (str): The document to split. + max_size (int): The maximum size of each chunk.""" return [document[i : i + max_size] for i in range(0, len(document), max_size)] async def extract_key_phrases_from_text( - data: list[str], max_key_phrase_count: int + data: list[str], max_key_phrase_count: int, retries_left: int = 3 ) -> list[str]: + """Extract key phrases from the text. + + Args: + data (list[str]): The text data. + max_key_phrase_count (int): The maximum number of key phrases to return. 
+ + Returns: + list[str]: The key phrases extracted from the text.""" logging.info("Python HTTP trigger function processed a request.") - max_retries = 5 key_phrase_list = [] + + if get_identity_type() == IdentityType.SYSTEM_ASSIGNED: + credential = DefaultAzureCredential() + elif get_identity_type() == IdentityType.USER_ASSIGNED: + credential = DefaultAzureCredential( + managed_identity_client_id=os.environ["FunctionApp__ClientId"] + ) + else: + credential = AzureKeyCredential(os.environ["AIService__Services__Key"]) text_analytics_client = TextAnalyticsClient( endpoint=os.environ["AIService__Services__Endpoint"], - credential=AzureKeyCredential(os.environ["AIService__Services__Key"]), + credential=credential, ) - try: - async with text_analytics_client: - retries = 0 - while retries < max_retries: - try: - # Split large documents - split_documents = [] - for doc in data: - if len(doc) > MAX_TEXT_ELEMENTS: - split_documents.extend( - split_document(doc, MAX_TEXT_ELEMENTS) - ) - else: - split_documents.append(doc) - result = await text_analytics_client.extract_key_phrases( - split_documents - ) - for idx, doc in enumerate(result): - if not doc.is_error: - key_phrase_list.extend( - doc.key_phrases[:max_key_phrase_count] - ) - else: - raise Exception(f"Document {idx} error: {doc.error}") - break # Exit the loop if the request is successful - except HttpResponseError as e: - if e.status_code == 429: # Rate limiting error - retries += 1 - wait_time = 2**retries # Exponential backoff - print( - f"Rate limit exceeded. Retrying in {wait_time} seconds..." 
- ) - await asyncio.sleep(wait_time) - else: - raise Exception(f"An error occurred: {e}") - except Exception as e: - raise Exception(f"An error occurred: {e}") + async with text_analytics_client: + try: + # Split large documents + split_documents = [] + for doc in data: + if len(doc) > MAX_TEXT_ELEMENTS: + split_documents.extend(split_document(doc, MAX_TEXT_ELEMENTS)) + else: + split_documents.append(doc) + + result = await text_analytics_client.extract_key_phrases(split_documents) + for idx, doc in enumerate(result): + if not doc.is_error: + key_phrase_list.extend(doc.key_phrases[:max_key_phrase_count]) + else: + raise Exception(f"Document {idx} error: {doc.error}") + except HttpResponseError as e: + if e.status_code == 429 and retries_left > 0: # Rate limiting error + wait_time = 2**retries_left # Exponential backoff + logging.info( + "%s Rate limit exceeded. Retrying in %s seconds...", e, wait_time + ) + await asyncio.sleep(wait_time) + return await extract_key_phrases_from_text( + data, max_key_phrase_count, retries_left - 1 + ) + else: + raise Exception(f"An error occurred: {e}") from e return key_phrase_list @@ -92,40 +105,26 @@ async def process_key_phrase_extraction( "errors": None, "warnings": None, } - extracted_record["data"]["keyPhrases"] = await extract_key_phrases_from_text( + extracted_record["data"]["key_phrases"] = await extract_key_phrases_from_text( [record["data"]["text"]], max_key_phrase_count ) - except Exception as e: - logging.error("key phrase extraction Error: %s", e) - await asyncio.sleep(10) - try: - extracted_record = { - "recordId": record["recordId"], - "data": {}, - "errors": None, - "warnings": None, - } - extracted_record["data"][ - "keyPhrases" - ] = await extract_key_phrases_from_text( - [record["data"]["text"]], max_key_phrase_count - ) - except Exception as inner_e: - logging.error("key phrase extraction Error: %s", inner_e) - logging.error( - "Failed to extract key phrase. 
Check function app logs for more details of exact failure." - ) - return { - "recordId": record["recordId"], - "data": {}, - "errors": [ - { - "message": "Failed to extract key phrase. Check function app logs for more details of exact failure." - } - ], - "warnings": None, - } - json_str = json.dumps(extracted_record, indent=4) - - logging.info(f"key phrase extraction output: {json_str}") - return extracted_record + except Exception as inner_e: + logging.error("key phrase extraction Error: %s", inner_e) + logging.error( + "Failed to extract key phrase. Check function app logs for more details of exact failure." + ) + return { + "recordId": record["recordId"], + "data": {}, + "errors": [ + { + "message": "Failed to extract key phrase. Check function app logs for more details of exact failure." + } + ], + "warnings": None, + } + else: + json_str = json.dumps(extracted_record, indent=4) + + logging.info(f"key phrase extraction output: {json_str}") + return extracted_record diff --git a/ai_search_with_adi/function_app/requirements.txt b/ai_search_with_adi/function_app/requirements.txt index adf82be..a923e7f 100644 --- a/ai_search_with_adi/function_app/requirements.txt +++ b/ai_search_with_adi/function_app/requirements.txt @@ -18,3 +18,4 @@ azure-ai-textanalytics azure-ai-vision-imageanalysis PyMuPDF aiohttp +Pillow diff --git a/ai_search_with_adi/function_app/storage_account.py b/ai_search_with_adi/function_app/storage_account.py index ecb4fea..bc70e74 100644 --- a/ai_search_with_adi/function_app/storage_account.py +++ b/ai_search_with_adi/function_app/storage_account.py @@ -10,18 +10,29 @@ class StorageAccountHelper: + """Helper class for interacting with Azure Blob Storage.""" + def __init__(self) -> None: + """Initialize the StorageAccountHelper class.""" self._client_id = os.environ["FunctionApp__ClientId"] - self._endpoint = os.environ["StorageAccount__Endpoint"] + self._endpoint = os.environ["StorageAccount__ConnectionString"] async def get_client(self): + """Get 
the BlobServiceClient object.""" credential = DefaultAzureCredential(managed_identity_client_id=self._client_id) return BlobServiceClient(account_url=self._endpoint, credential=credential) - async def add_metadata_to_blob(self, source: str, container: str, metadata) -> None: - """Add metadata to the business glossary blob.""" + async def add_metadata_to_blob( + self, source: str, container: str, metadata: dict + ) -> None: + """Add metadata to the blob. + + Args + source (str): The source of the blob. + container (str): The container of the blob. + metadata (dict): The metadata to add to the blob.""" blob = urllib.parse.unquote_plus(source) @@ -37,7 +48,12 @@ async def add_metadata_to_blob(self, source: str, container: str, metadata) -> N async def download_blob_to_temp_dir( self, source: str, container: str, target_file_name ) -> tuple[str, dict]: - """Download the business glossary file from the Azure Blob Storage.""" + """Download the file from the Azure Blob Storage. + + Args: + source (str): The source of the blob. + container (str): The container of the blob. 
+ target_file_name (str): The target file name.""" blob = urllib.parse.unquote_plus(source) @@ -63,16 +79,3 @@ async def download_blob_to_temp_dir( temp_file.write(blob_contents) return temp_file_path, blob_properties.metadata - - async def upload_business_glossary_dataframe(self, df: str, sheet: str) -> str: - """Upload the business glossary dataframe to a JSONL file.""" - json_lines = df.to_json(orient="records", lines=True) - - container = os.environ["StorageAccount__BusinessGlossary__Container"] - blob = f"{sheet}.jsonl" - blob_service_client = await self.get_client() - async with blob_service_client: - async with blob_service_client.get_blob_client( - container=container, blob=blob - ) as blob_client: - await blob_client.upload_blob(json_lines, overwrite=True) From ecabdb226103809be2ea3e89cfa1098b153367c8 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 17:14:56 +0100 Subject: [PATCH 17/33] Update readmes --- ai_search_with_adi/README.md | 134 +------------------- ai_search_with_adi/function_app/README.md | 144 ++++++++++++++++++++++ 2 files changed, 145 insertions(+), 133 deletions(-) diff --git a/ai_search_with_adi/README.md b/ai_search_with_adi/README.md index 6165ca4..6fe4aed 100644 --- a/ai_search_with_adi/README.md +++ b/ai_search_with_adi/README.md @@ -52,139 +52,7 @@ Deploy the associated function app and required resources. You can then experime To use with an index, either use the utility to configure a indexer in the provided form, or integrate the skill with your skillset pipeline. -### function_app.py - -`./function_apps/indexer/function_app.py` contains the HTTP entrypoints for the ADI skill and the other provided utility skills. - -### adi_2_aisearch - -`./function_apps/indexer/adi_2_aisearch.py` contains the methods for content extraction with ADI. The key methods are: - -#### analyse_document - -This method takes the passed file, uploads it to ADI and retrieves the Markdown format. 
- -#### process_figures_from_extracted_content - -This method takes the detected figures, and crops them out of the page to save them as images. It uses the `understand_image_with_vlm` to communicate with Azure OpenAI to understand the meaning of the extracted figure. - -`update_figure_description` is used to update the original Markdown content with the description and meaning of the figure. - -#### clean_adi_markdown - -This method performs the final cleaning of the Markdown contents. In this method, the section headings and page numbers are extracted for the content to be returned to the indexer. - -### Input Format - -The ADI Skill conforms to the [Azure AI Search Custom Skill Input Format](https://learn.microsoft.com/en-gb/azure/search/cognitive-search-custom-skill-web-api?WT.mc_id=Portal-Microsoft_Azure_Search#sample-input-json-structure). AI Search will automatically build this format if you use the utility file provided in this repo to build your indexer and skillset. - -```json -{ - "values": [ - { - "recordId": "0", - "data": { - "source": "" - } - }, - { - "recordId": "1", - "data": { - "source": "" - } - } - ] -} -``` - -### Output Format - -The ADI Skill conforms to the [Azure AI Search Custom Skill Output Format](https://learn.microsoft.com/en-gb/azure/search/cognitive-search-custom-skill-web-api?WT.mc_id=Portal-Microsoft_Azure_Search#sample-output-json-structure). 
- -If `chunk_by_page` header is `True` (recommended): - -```json -{ - "values": [ - { - "recordId": "0", - "data": { - "extracted_content": [ - { - "page_number": 1, - "sections": [ - "" - ], - "content": "" - }, - { - "page_number": 2, - "sections": [ - "" - ], - "content": "" - } - ] - } - }, - { - "recordId": "1", - "data": { - "extracted_content": [ - { - "page_number": 1, - "sections": [ - "" - ], - "content": "" - }, - { - "page_number": 2, - "sections": [ - "" - ], - "content": "" - } - ] - } - } - ] -} -``` - -If `chunk_by_page` header is `False`: - -```json -{ - "values": [ - { - "recordId": "0", - "data": { - "extracted_content": { - "sections": [ - "" - ], - "content": "" - } - } - }, - { - "recordId": "1", - "data": { - "extracted_content": { - "sections": [ - "" - ], - "content": "" - } - } - } - ] -} -``` - -**Page wise analysis in ADI is recommended to avoid splitting tables / figures across multiple chunks, when the chunking is performed.** - +Steps for deployment of the function app can be found in `./function_app/README.md`. ## Production Considerations diff --git a/ai_search_with_adi/function_app/README.md b/ai_search_with_adi/function_app/README.md index e69de29..ad4a6c0 100644 --- a/ai_search_with_adi/function_app/README.md +++ b/ai_search_with_adi/function_app/README.md @@ -0,0 +1,144 @@ +# AI Search Indexing with Azure Document Intelligence - Function App Setup + +The associated scripts in this portion of the repository contains the Azure Document Intelligence powered Function app. + +## Steps + +1. Update `.env` file with the associated values. Not all values are required dependent on whether you are using System / User Assigned Identities or a Key based authentication. Use this template to update the environment variables in the function app. +2. Make sure the infra and required identities are setup. This setup requires Azure Document Intelligence and GPT4o. +3. 
[Deploy your function app](https://learn.microsoft.com/en-us/azure/azure-functions/functions-deployment-technologies?tabs=windows) and test with a HTTP request. + +## Code Files + +### function_app.py + +`./indexer/function_app.py` contains the HTTP entrypoints for the ADI skill and the other provided utility skills. + +### adi_2_aisearch + +`./indexer/adi_2_aisearch.py` contains the methods for content extraction with ADI. The key methods are: + +#### analyse_document + +This method takes the passed file, uploads it to ADI and retrieves the Markdown format. + +#### process_figures_from_extracted_content + +This method takes the detected figures, and crops them out of the page to save them as images. It uses the `understand_image_with_vlm` to communicate with Azure OpenAI to understand the meaning of the extracted figure. + +`update_figure_description` is used to update the original Markdown content with the description and meaning of the figure. + +#### clean_adi_markdown + +This method performs the final cleaning of the Markdown contents. In this method, the section headings and page numbers are extracted for the content to be returned to the indexer. + +## Input Format + +The ADI Skill conforms to the [Azure AI Search Custom Skill Input Format](https://learn.microsoft.com/en-gb/azure/search/cognitive-search-custom-skill-web-api?WT.mc_id=Portal-Microsoft_Azure_Search#sample-input-json-structure). AI Search will automatically build this format if you use the utility file provided in this repo to build your indexer and skillset. + +```json +{ + "values": [ + { + "recordId": "0", + "data": { + "source": "" + } + }, + { + "recordId": "1", + "data": { + "source": "" + } + } + ] +} +``` + +## Output Format + +The ADI Skill conforms to the [Azure AI Search Custom Skill Output Format](https://learn.microsoft.com/en-gb/azure/search/cognitive-search-custom-skill-web-api?WT.mc_id=Portal-Microsoft_Azure_Search#sample-output-json-structure). 
+ +If `chunk_by_page` header is `True` (recommended): + +```json +{ + "values": [ + { + "recordId": "0", + "data": { + "extracted_content": [ + { + "page_number": 1, + "sections": [ + "" + ], + "content": "" + }, + { + "page_number": 2, + "sections": [ + "" + ], + "content": "" + } + ] + } + }, + { + "recordId": "1", + "data": { + "extracted_content": [ + { + "page_number": 1, + "sections": [ + "" + ], + "content": "" + }, + { + "page_number": 2, + "sections": [ + "" + ], + "content": "" + } + ] + } + } + ] +} +``` + +If `chunk_by_page` header is `False`: + +```json +{ + "values": [ + { + "recordId": "0", + "data": { + "extracted_content": { + "sections": [ + "" + ], + "content": "" + } + } + }, + { + "recordId": "1", + "data": { + "extracted_content": { + "sections": [ + "" + ], + "content": "" + } + } + } + ] +} +``` + +**Page wise analysis in ADI is recommended to avoid splitting tables / figures across multiple chunks, when the chunking is performed.** From d433851ee9d603ce751773d72e7c7d7d7c367b13 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 17:27:59 +0100 Subject: [PATCH 18/33] Restructure --- ai_search_with_adi/ai_search/environment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ai_search_with_adi/ai_search/environment.py b/ai_search_with_adi/ai_search/environment.py index 035facf..3ded085 100644 --- a/ai_search_with_adi/ai_search/environment.py +++ b/ai_search_with_adi/ai_search/environment.py @@ -54,7 +54,7 @@ def identity_type(self) -> IdentityType: Returns: IdentityType: The identity type """ - identity = os.environ.get("IdentityType") + identity = os.environ.get("IdentityType").lower() if identity == "user_assigned": return IdentityType.USER_ASSIGNED @@ -231,7 +231,7 @@ def use_private_endpoint(self) -> bool: This function returns true if private endpoint is used """ return ( - os.environ.get("AIService__AzureSearchOptions__UsePrivateEndpoint") + 
os.environ.get("AIService__AzureSearchOptions__UsePrivateEndpoint").lower() == "true" ) From cdab428d75a50cde68818307bbf7f67ce890f5b7 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 17:37:55 +0100 Subject: [PATCH 19/33] Update changes --- .vscode/tasks.json | 2 +- ai_search_with_adi/README.md | 4 ++-- .../{function_app => adi_function_app}/.funcignore | 0 .../{function_app => adi_function_app}/README.md | 2 +- .../adi_2_ai_search.py | 0 .../{function_app => adi_function_app}/environment.py | 0 .../{function_app => adi_function_app}/function_app.py | 0 .../{function_app => adi_function_app}/host.json | 0 .../key_phrase_extraction.py | 0 .../pre_embedding_cleaner.py | 0 .../{function_app => adi_function_app}/requirements.txt | 0 .../storage_account.py | 0 ai_search_with_adi/ai_search/ai_search.py | 7 +++++-- ai_search_with_adi/ai_search/deploy.py | 2 +- ai_search_with_adi/ai_search/environment.py | 7 +++++-- ai_search_with_adi/ai_search/rag_documents.py | 4 ++-- ai_search_with_adi/function_app/.env | 9 --------- 17 files changed, 17 insertions(+), 20 deletions(-) rename ai_search_with_adi/{function_app => adi_function_app}/.funcignore (100%) rename ai_search_with_adi/{function_app => adi_function_app}/README.md (97%) rename ai_search_with_adi/{function_app => adi_function_app}/adi_2_ai_search.py (100%) rename ai_search_with_adi/{function_app => adi_function_app}/environment.py (100%) rename ai_search_with_adi/{function_app => adi_function_app}/function_app.py (100%) rename ai_search_with_adi/{function_app => adi_function_app}/host.json (100%) rename ai_search_with_adi/{function_app => adi_function_app}/key_phrase_extraction.py (100%) rename ai_search_with_adi/{function_app => adi_function_app}/pre_embedding_cleaner.py (100%) rename ai_search_with_adi/{function_app => adi_function_app}/requirements.txt (100%) rename ai_search_with_adi/{function_app => adi_function_app}/storage_account.py (100%) delete mode 100644 
ai_search_with_adi/function_app/.env diff --git a/.vscode/tasks.json b/.vscode/tasks.json index f9a5026..b4388cb 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -5,7 +5,7 @@ "isBackground": true, "label": "func: host start", "options": { - "cwd": "${workspaceFolder}/ai_search_with_adi/function_app" + "cwd": "${workspaceFolder}/ai_search_with_adi/adi_function_app" }, "problemMatcher": "$func-python-watch", "type": "func" diff --git a/ai_search_with_adi/README.md b/ai_search_with_adi/README.md index 6fe4aed..e1e2b47 100644 --- a/ai_search_with_adi/README.md +++ b/ai_search_with_adi/README.md @@ -39,7 +39,7 @@ The properties returned from the ADI Custom Skill are then used to perform the f ## Provided Notebooks \& Utilities - `./ai_search.py`, `./deploy.py` provide an easy Python based utility for deploying an index, indexer and corresponding skillset for AI Search. -- `./function_apps/indexer` provides a pre-built Python function app that communicates with Azure Document Intelligence, Azure OpenAI etc to perform the Markdown conversion, extraction of figures, figure understanding and corresponding cleaning of Markdown. +- `./adi_function_apps/indexer` provides a pre-built Python function app that communicates with Azure Document Intelligence, Azure OpenAI etc to perform the Markdown conversion, extraction of figures, figure understanding and corresponding cleaning of Markdown. - `./rag_with_ai_search.ipynb` provides example of how to utilise the AI Search plugin to query the index. ## Deploying AI Search Setup @@ -52,7 +52,7 @@ Deploy the associated function app and required resources. You can then experime To use with an index, either use the utility to configure a indexer in the provided form, or integrate the skill with your skillset pipeline. -Steps for deployment of the function app can be found in `./function_app/README.md`. +Steps for deployment of the function app can be found in `./adi_function_app/README.md`. 
## Production Considerations diff --git a/ai_search_with_adi/function_app/.funcignore b/ai_search_with_adi/adi_function_app/.funcignore similarity index 100% rename from ai_search_with_adi/function_app/.funcignore rename to ai_search_with_adi/adi_function_app/.funcignore diff --git a/ai_search_with_adi/function_app/README.md b/ai_search_with_adi/adi_function_app/README.md similarity index 97% rename from ai_search_with_adi/function_app/README.md rename to ai_search_with_adi/adi_function_app/README.md index ad4a6c0..b718fd3 100644 --- a/ai_search_with_adi/function_app/README.md +++ b/ai_search_with_adi/adi_function_app/README.md @@ -12,7 +12,7 @@ The associated scripts in this portion of the repository contains the Azure Docu ### function_app.py -`./indexer/function_app.py` contains the HTTP entrypoints for the ADI skill and the other provided utility skills. +`./indexer/adi_function_app.py` contains the HTTP entrypoints for the ADI skill and the other provided utility skills. ### adi_2_aisearch diff --git a/ai_search_with_adi/function_app/adi_2_ai_search.py b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py similarity index 100% rename from ai_search_with_adi/function_app/adi_2_ai_search.py rename to ai_search_with_adi/adi_function_app/adi_2_ai_search.py diff --git a/ai_search_with_adi/function_app/environment.py b/ai_search_with_adi/adi_function_app/environment.py similarity index 100% rename from ai_search_with_adi/function_app/environment.py rename to ai_search_with_adi/adi_function_app/environment.py diff --git a/ai_search_with_adi/function_app/function_app.py b/ai_search_with_adi/adi_function_app/function_app.py similarity index 100% rename from ai_search_with_adi/function_app/function_app.py rename to ai_search_with_adi/adi_function_app/function_app.py diff --git a/ai_search_with_adi/function_app/host.json b/ai_search_with_adi/adi_function_app/host.json similarity index 100% rename from ai_search_with_adi/function_app/host.json rename to 
ai_search_with_adi/adi_function_app/host.json diff --git a/ai_search_with_adi/function_app/key_phrase_extraction.py b/ai_search_with_adi/adi_function_app/key_phrase_extraction.py similarity index 100% rename from ai_search_with_adi/function_app/key_phrase_extraction.py rename to ai_search_with_adi/adi_function_app/key_phrase_extraction.py diff --git a/ai_search_with_adi/function_app/pre_embedding_cleaner.py b/ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py similarity index 100% rename from ai_search_with_adi/function_app/pre_embedding_cleaner.py rename to ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py diff --git a/ai_search_with_adi/function_app/requirements.txt b/ai_search_with_adi/adi_function_app/requirements.txt similarity index 100% rename from ai_search_with_adi/function_app/requirements.txt rename to ai_search_with_adi/adi_function_app/requirements.txt diff --git a/ai_search_with_adi/function_app/storage_account.py b/ai_search_with_adi/adi_function_app/storage_account.py similarity index 100% rename from ai_search_with_adi/function_app/storage_account.py rename to ai_search_with_adi/adi_function_app/storage_account.py diff --git a/ai_search_with_adi/ai_search/ai_search.py b/ai_search_with_adi/ai_search/ai_search.py index ee1fa99..c2646fc 100644 --- a/ai_search_with_adi/ai_search/ai_search.py +++ b/ai_search_with_adi/ai_search/ai_search.py @@ -28,7 +28,7 @@ ) from azure.core.exceptions import HttpResponseError from azure.search.documents.indexes import SearchIndexerClient, SearchIndexClient -from ai_search_with_adi.ai_search.environment import AISearchEnvironment, IdentityType +from environment import AISearchEnvironment, IdentityType class AISearch(ABC): @@ -45,7 +45,10 @@ def __init__( suffix (str, optional): The suffix for the indexer. Defaults to None. If an suffix is provided, it is assumed to be a test indexer. rebuild (bool, optional): Whether to rebuild the index. Defaults to False. 
""" - self.indexer_type = None + + if not hasattr(self, "indexer_type"): + self.indexer_type = None # Needed to help mypy understand that indexer_type is defined in the child class + raise ValueError("indexer_type is not defined in the child class.") if rebuild is not None: self.rebuild = rebuild diff --git a/ai_search_with_adi/ai_search/deploy.py b/ai_search_with_adi/ai_search/deploy.py index e28a61c..7254c03 100644 --- a/ai_search_with_adi/ai_search/deploy.py +++ b/ai_search_with_adi/ai_search/deploy.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. import argparse -from ai_search_with_adi.ai_search.rag_documents import RagDocumentsAISearch +from rag_documents import RagDocumentsAISearch def deploy_config(arguments: argparse.Namespace): diff --git a/ai_search_with_adi/ai_search/environment.py b/ai_search_with_adi/ai_search/environment.py index 3ded085..ee5bd15 100644 --- a/ai_search_with_adi/ai_search/environment.py +++ b/ai_search_with_adi/ai_search/environment.py @@ -32,6 +32,7 @@ def __init__(self, indexer_type: IndexerType): indexer_type (IndexerType): The type of the indexer """ load_dotenv(find_dotenv()) + self.indexer_type = indexer_type @property @@ -41,6 +42,8 @@ def normalised_indexer_type(self) -> str: Returns: str: The normalised indexer type """ + print(self.indexer_type.value) + print(self.indexer_type) normalised_indexer_type = ( self.indexer_type.value.replace("-", " ").title().replace(" ", "") ) @@ -104,9 +107,9 @@ def ai_search_credential(self) -> DefaultAzureCredential | AzureKeyCredential: Returns: DefaultAzureCredential | AzureKeyCredential: The ai search credential """ - if self.identity_type in IdentityType.SYSTEM_ASSIGNED: + if self.identity_type == IdentityType.SYSTEM_ASSIGNED: return DefaultAzureCredential() - elif self.identity_type in IdentityType.USER_ASSIGNED: + elif self.identity_type == IdentityType.USER_ASSIGNED: return DefaultAzureCredential( 
managed_identity_client_id=self.ai_search_identity_id ) diff --git a/ai_search_with_adi/ai_search/rag_documents.py b/ai_search_with_adi/ai_search/rag_documents.py index 7c3184f..a0feffc 100644 --- a/ai_search_with_adi/ai_search/rag_documents.py +++ b/ai_search_with_adi/ai_search/rag_documents.py @@ -23,7 +23,7 @@ IndexerExecutionEnvironment, ) from ai_search import AISearch -from ai_search_with_adi.ai_search.environment import ( +from environment import ( IndexerType, ) @@ -43,9 +43,9 @@ def __init__( suffix (str, optional): The suffix for the indexer. Defaults to None. If an suffix is provided, it is assumed to be a test indexer. rebuild (bool, optional): Whether to rebuild the index. Defaults to False. """ + self.indexer_type = IndexerType.RAG_DOCUMENTS super().__init__(suffix, rebuild) - self.indexer_type = IndexerType.RAG_DOCUMENTS if enable_page_by_chunking is not None: self.enable_page_by_chunking = enable_page_by_chunking else: diff --git a/ai_search_with_adi/function_app/.env b/ai_search_with_adi/function_app/.env deleted file mode 100644 index fdf51e9..0000000 --- a/ai_search_with_adi/function_app/.env +++ /dev/null @@ -1,9 +0,0 @@ -AIService__Services__Endpoint= -AIService__Services__Key= -IdentityType= # system_assigned or user_assigned or key -StorageAccount__ConnectionString= -OpenAI__ApiKey= -OpenAI__Endpoint= -OpenAI__MultiModalDeployment= -OpenAI__ApiVersion= -FunctionApp__ClientId= From e40054f8b2fcd26925cf51d26d854fe4adbbbeb0 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 17:50:36 +0100 Subject: [PATCH 20/33] Fix deployment bugs in indexer --- ai_search_with_adi/ai_search/ai_search.py | 7 ++++--- ai_search_with_adi/ai_search/environment.py | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ai_search_with_adi/ai_search/ai_search.py b/ai_search_with_adi/ai_search/ai_search.py index c2646fc..4f4edb8 100644 --- a/ai_search_with_adi/ai_search/ai_search.py +++ b/ai_search_with_adi/ai_search/ai_search.py @@ -331,7 
+331,7 @@ def get_vector_skill( InputFieldMappingEntry(name="text", source=source), ] embedding_skill_outputs = [ - OutputFieldMappingEntry(name="vector", target_name=target_name) + OutputFieldMappingEntry(name="embedding", target_name=target_name) ] vector_skill = AzureOpenAIEmbeddingSkill( @@ -340,6 +340,7 @@ def get_vector_skill( context=context, deployment_id=self.environment.open_ai_embedding_deployment, model_name=self.environment.open_ai_embedding_model, + resource_uri=self.environment.open_ai_endpoint, inputs=embedding_skill_inputs, outputs=embedding_skill_outputs, dimensions=self.environment.open_ai_embedding_dimensions, @@ -416,8 +417,8 @@ def get_vector_search(self) -> VectorSearch: open_ai_params = AzureOpenAIParameters( resource_uri=self.environment.open_ai_endpoint, - modelName=self.environment.open_ai_embedding_model, - deploymentId=self.environment.open_ai_embedding_deployment, + model_name=self.environment.open_ai_embedding_model, + deployment_id=self.environment.open_ai_embedding_deployment, ) if self.environment.identity_type == IdentityType.KEY: diff --git a/ai_search_with_adi/ai_search/environment.py b/ai_search_with_adi/ai_search/environment.py index ee5bd15..d7cbdb4 100644 --- a/ai_search_with_adi/ai_search/environment.py +++ b/ai_search_with_adi/ai_search/environment.py @@ -42,8 +42,7 @@ def normalised_indexer_type(self) -> str: Returns: str: The normalised indexer type """ - print(self.indexer_type.value) - print(self.indexer_type) + normalised_indexer_type = ( self.indexer_type.value.replace("-", " ").title().replace(" ", "") ) From a9386a94f5b1b7bea3f3ab45f23edbed1f054c1c Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 18:01:52 +0100 Subject: [PATCH 21/33] Storage account code bug fix --- .../adi_function_app/storage_account.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ai_search_with_adi/adi_function_app/storage_account.py 
b/ai_search_with_adi/adi_function_app/storage_account.py index bc70e74..7b0fd6b 100644 --- a/ai_search_with_adi/adi_function_app/storage_account.py +++ b/ai_search_with_adi/adi_function_app/storage_account.py @@ -7,6 +7,7 @@ from azure.storage.blob.aio import BlobServiceClient from azure.identity import DefaultAzureCredential import urllib +from environment import IdentityType, get_identity_type class StorageAccountHelper: @@ -14,15 +15,20 @@ class StorageAccountHelper: def __init__(self) -> None: """Initialize the StorageAccountHelper class.""" - self._client_id = os.environ["FunctionApp__ClientId"] - self._endpoint = os.environ["StorageAccount__ConnectionString"] async def get_client(self): """Get the BlobServiceClient object.""" - credential = DefaultAzureCredential(managed_identity_client_id=self._client_id) - - return BlobServiceClient(account_url=self._endpoint, credential=credential) + if get_identity_type() == IdentityType.SYSTEM_ASSIGNED: + credential = DefaultAzureCredential() + return BlobServiceClient(account_url=self._endpoint, credential=credential) + elif get_identity_type() == IdentityType.USER_ASSIGNED: + credential = DefaultAzureCredential( + managed_identity_client_id=os.environ["FunctionApp__ClientId"] + ) + return BlobServiceClient(account_url=self._endpoint, credential=credential) + else: + return BlobServiceClient(account_url=self._endpoint) async def add_metadata_to_blob( self, source: str, container: str, metadata: dict From 0634e24c1d5361bd93f04f29af547954ba4eaeba Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 18:09:38 +0100 Subject: [PATCH 22/33] Further bug fixes --- .../adi_function_app/adi_2_ai_search.py | 2 +- .../adi_function_app/key_phrase_extraction.py | 6 +++--- .../adi_function_app/storage_account.py | 15 +++++++-------- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py index 
bb41f25..d600ac9 100644 --- a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py +++ b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py @@ -146,7 +146,7 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): elif get_identity_type() != IdentityType.USER_ASSIGNED: token_provider = get_bearer_token_provider( DefaultAzureCredential( - managed_identity_client_id=os.environ["FunctionApp__ClientId"] + managed_identity_client_id=os.environ.get("FunctionApp__ClientId") ), "https://cognitiveservices.azure.com/.default", ) diff --git a/ai_search_with_adi/adi_function_app/key_phrase_extraction.py b/ai_search_with_adi/adi_function_app/key_phrase_extraction.py index 1b699e2..c93d62a 100644 --- a/ai_search_with_adi/adi_function_app/key_phrase_extraction.py +++ b/ai_search_with_adi/adi_function_app/key_phrase_extraction.py @@ -42,12 +42,12 @@ async def extract_key_phrases_from_text( credential = DefaultAzureCredential() elif get_identity_type() == IdentityType.USER_ASSIGNED: credential = DefaultAzureCredential( - managed_identity_client_id=os.environ["FunctionApp__ClientId"] + managed_identity_client_id=os.environ.get("FunctionApp__ClientId") ) else: - credential = AzureKeyCredential(os.environ["AIService__Services__Key"]) + credential = AzureKeyCredential(os.environ.get("AIService__Services__Key")) text_analytics_client = TextAnalyticsClient( - endpoint=os.environ["AIService__Services__Endpoint"], + endpoint=os.environ.get("AIService__Services__Endpoint"), credential=credential, ) diff --git a/ai_search_with_adi/adi_function_app/storage_account.py b/ai_search_with_adi/adi_function_app/storage_account.py index 7b0fd6b..8c5aa98 100644 --- a/ai_search_with_adi/adi_function_app/storage_account.py +++ b/ai_search_with_adi/adi_function_app/storage_account.py @@ -13,22 +13,21 @@ class StorageAccountHelper: """Helper class for interacting with Azure Blob Storage.""" - def __init__(self) -> None: - """Initialize the StorageAccountHelper class.""" - 
self._endpoint = os.environ["StorageAccount__ConnectionString"] - async def get_client(self): """Get the BlobServiceClient object.""" if get_identity_type() == IdentityType.SYSTEM_ASSIGNED: + endpoint = os.environ.get("StorageAccount__Endpoint") credential = DefaultAzureCredential() - return BlobServiceClient(account_url=self._endpoint, credential=credential) + return BlobServiceClient(account_url=endpoint, credential=credential) elif get_identity_type() == IdentityType.USER_ASSIGNED: + endpoint = os.environ.get("StorageAccount__Endpoint") credential = DefaultAzureCredential( - managed_identity_client_id=os.environ["FunctionApp__ClientId"] + managed_identity_client_id=os.environ.get("FunctionApp__ClientId") ) - return BlobServiceClient(account_url=self._endpoint, credential=credential) + return BlobServiceClient(account_url=endpoint, credential=credential) else: - return BlobServiceClient(account_url=self._endpoint) + endpoint = os.environ.get("StorageAccount__ConnectionString") + return BlobServiceClient(account_url=endpoint) async def add_metadata_to_blob( self, source: str, container: str, metadata: dict From c94ef0878fa890508322f0967764b1d262756de4 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 18:41:32 +0100 Subject: [PATCH 23/33] Bug fix adi code --- .../adi_function_app/adi_2_ai_search.py | 35 +++++++++++++------ .../adi_function_app/function_app.py | 2 +- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py index d600ac9..39d65f8 100644 --- a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py +++ b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py @@ -32,6 +32,9 @@ def crop_image_from_pdf_page(pdf_path, page_number, bounding_box): doc = fitz.open(pdf_path) page = doc.load_page(page_number) + logging.debug(f"Bounding Box: {bounding_box}") + logging.debug(f"Page Number: {page_number}") + # Cropping the page. 
The rect requires the coordinates in the format (x0, y0, x1, y1). bbx = [x * 72 for x in bounding_box] rect = fitz.Rect(bbx) @@ -39,6 +42,10 @@ def crop_image_from_pdf_page(pdf_path, page_number, bounding_box): img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + if pix.width == 0 or pix.height == 0: + logging.error("Cropped image has 0 width or height.") + return None + doc.close() return img @@ -160,7 +167,7 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): api_key=api_key, api_version=api_version, azure_ad_token_provider=token_provider, - azure_endpoint=os.environ.get("OpenAI__AzureEndpoint"), + azure_endpoint=os.environ.get("OpenAI__Endpoint"), ) as client: # We send both image caption and the image body to GPTv for better understanding if caption != "": @@ -179,8 +186,10 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): "text": f"Describe this image with technical analysis. Provide a well-structured, description. IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'. (note: it has image caption: {caption}):", }, { - "type": "image_base64", - "image_base64": {"image": image_base64}, + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{image_base64}" + }, }, ], }, @@ -204,8 +213,10 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): "text": "Describe this image with technical analysis. Provide a well-structured, description. 
IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'.", }, { - "type": "image_base64", - "image_base64": {"image": image_base64}, + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{image_base64}" + }, }, ], }, @@ -294,12 +305,16 @@ async def process_figures_from_extracted_content( file_path, region.page_number - 1, bounding_box ) # page_number is 1-indexed3 - image_base64 = pil_image_to_base64(cropped_image) + if cropped_image is None: + img_description += "Irrelevant Image" + else: + image_base64 = pil_image_to_base64(cropped_image) - img_description += await understand_image_with_gptv( - image_base64, figure.caption.content - ) - logging.info(f"\tDescription of figure {idx}: {img_description}") + img_description = await understand_image_with_gptv( + image_base64, figure.caption.content + ) + logging.info(f"\tDescription of figure {idx}: {img_description}") + break markdown_content = update_figure_description( markdown_content, img_description, idx diff --git a/ai_search_with_adi/adi_function_app/function_app.py b/ai_search_with_adi/adi_function_app/function_app.py index 52cc66e..cca6005 100644 --- a/ai_search_with_adi/adi_function_app/function_app.py +++ b/ai_search_with_adi/adi_function_app/function_app.py @@ -9,7 +9,7 @@ from pre_embedding_cleaner import process_pre_embedding_cleaner from key_phrase_extraction import process_key_phrase_extraction -logging.basicConfig(level=logging.INFO) +logging.basicConfig(level=logging.DEBUG) app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION) From 23ec0297fef2908ba5c9cfedf8a74a9074fb9e3c Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 18:53:55 +0100 Subject: [PATCH 24/33] Update prompt --- .../adi_function_app/adi_2_ai_search.py | 95 ++++++++----------- 1 file changed, 41 insertions(+), 54 deletions(-) diff --git a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py index 
39d65f8..1fe225a 100644 --- a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py +++ b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py @@ -66,10 +66,10 @@ def clean_adi_markdown( """ output_dict = {} - comment_patterns = r"||" + comment_patterns = r"|||" cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL) - combined_pattern = r"(.*?)\n===|\n## ?(.*?)\n|\n### ?(.*?)\n" + combined_pattern = r"(.*?)\n===|\n# (.*?)\n|\n## ?(.*?)\n|\n### ?(.*?)\n|\n#### ?(.*?)\n|\n##### ?(.*?)\n|\n###### ?(.*?)\n" doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL) doc_metadata = [match for group in doc_metadata for match in group if match] @@ -162,6 +162,21 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): token_provider = None api_key = os.environ.get("OpenAI__ApiKey") + system_prompt = """You are an expert in image analysis. Use your experience and skills to provided a detailed description of any provided images. You should FOCUS on what info can be inferred from the image and the meaning of the data inside the image. Draw actionable insights and conclusions from the image. + + If the image is a chart for instance, you should describe the data trends, patterns, and insights that can be drawn from the chart. + + If the image is a map, you should describe the geographical features, landmarks, and any other relevant information that can be inferred from the map. + + If the image is a diagram, you should describe the components, relationships, and any other relevant information that can be inferred from the diagram. + + IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'.""" + + user_input = "Describe this image with technical analysis. Provide a well-structured, description." 
+ + if caption != "": + user_input += f" (note: it has image caption: {caption})" + try: async with AsyncAzureOpenAI( api_key=api_key, @@ -170,59 +185,31 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): azure_endpoint=os.environ.get("OpenAI__Endpoint"), ) as client: # We send both image caption and the image body to GPTv for better understanding - if caption != "": - response = await client.chat.completions.create( - model=model, - messages=[ - { - "role": "system", - "content": "You are an expert in image analysis. Use your experience and skills to provided a detailed description of any provided images. You should focus on what info can be inferred from the image and the meaning of the data inside the image.", - }, - { - "role": "user", - "content": [ - { - "type": "text", - "text": f"Describe this image with technical analysis. Provide a well-structured, description. IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'. (note: it has image caption: {caption}):", - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{image_base64}" - }, - }, - ], - }, - ], - max_tokens=MAX_TOKENS, - ) - - else: - response = await client.chat.completions.create( - model=model, - messages=[ - { - "role": "system", - "content": "You are an expert in image analysis. Use your experience and skills to provided a detailed description of any provided images. You should focus on what info can be inferred from the image and the meaning of the data inside the image.", - }, - { - "role": "user", - "content": [ - { - "type": "text", - "text": "Describe this image with technical analysis. Provide a well-structured, description. 
IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'.", - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{image_base64}" - }, + response = await client.chat.completions.create( + model=model, + messages=[ + { + "role": "system", + "content": system_prompt, + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": user_input, + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{image_base64}" }, - ], - }, - ], - max_tokens=MAX_TOKENS, - ) + }, + ], + }, + ], + max_tokens=MAX_TOKENS, + ) img_description = response.choices[0].message.content From bde3d6dc94d9db5ef5990bae9338bb21a794ac76 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 19:21:25 +0100 Subject: [PATCH 25/33] Fix section bugs --- .../adi_function_app/adi_2_ai_search.py | 19 ++++++++++++---- .../adi_function_app/pre_embedding_cleaner.py | 22 +++++++++++++------ ai_search_with_adi/ai_search/ai_search.py | 18 +++++---------- ai_search_with_adi/ai_search/rag_documents.py | 2 +- 4 files changed, 37 insertions(+), 24 deletions(-) diff --git a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py index 1fe225a..da54d3f 100644 --- a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py +++ b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py @@ -69,7 +69,7 @@ def clean_adi_markdown( comment_patterns = r"|||" cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL) - combined_pattern = r"(.*?)\n===|\n# (.*?)\n|\n## ?(.*?)\n|\n### ?(.*?)\n|\n#### ?(.*?)\n|\n##### ?(.*?)\n|\n###### ?(.*?)\n" + combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n" doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL) doc_metadata = [match for group in doc_metadata for match in group if match] @@ -170,6 +170,8 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3): If the image is a 
diagram, you should describe the components, relationships, and any other relevant information that can be inferred from the diagram. + Include any data points, labels, and other relevant information that can be inferred from the image. + IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'.""" user_input = "Describe this image with technical analysis. Provide a well-structured, description." @@ -255,6 +257,10 @@ def pil_image_to_base64(image, image_format="JPEG"): return base64.b64encode(buffered.getvalue()).decode("utf-8") +async def mark_image_as_irrelevant(): + return "Irrelevant Image" + + async def process_figures_from_extracted_content( file_path: str, markdown_content: str, figures: list, page_number: None | int = None ) -> str: @@ -270,6 +276,8 @@ async def process_figures_from_extracted_content( Returns: -------- str: The updated Markdown content with the figure descriptions.""" + + image_understanding_tasks = [] for idx, figure in enumerate(figures): img_description = "" logging.debug(f"Figure #{idx} has the following spans: {figure.spans}") @@ -293,16 +301,19 @@ async def process_figures_from_extracted_content( ) # page_number is 1-indexed3 if cropped_image is None: - img_description += "Irrelevant Image" + image_understanding_tasks.append(mark_image_as_irrelevant()) else: image_base64 = pil_image_to_base64(cropped_image) - img_description = await understand_image_with_gptv( - image_base64, figure.caption.content + image_understanding_tasks.append( + understand_image_with_gptv(image_base64, figure.caption.content) ) logging.info(f"\tDescription of figure {idx}: {img_description}") break + image_descriptions = await asyncio.gather(*image_understanding_tasks) + + for idx, img_description in enumerate(image_descriptions): markdown_content = update_figure_description( markdown_content, img_description, idx ) diff --git a/ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py 
b/ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py index b3303d6..5c81d7a 100644 --- a/ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py +++ b/ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py @@ -21,10 +21,17 @@ def get_section(cleaned_text: str) -> list: list: The sections related to text """ - combined_pattern = r"(.*?)\n===|\n## ?(.*?)\n|\n### ?(.*?)\n" + combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n" doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL) doc_metadata = [match for group in doc_metadata for match in group if match] - return doc_metadata + return clean_sections(doc_metadata) + + +def clean_sections(sections: list) -> list: + """Cleans the sections by removing special characters and extra white spaces.""" + cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections] + + return cleaned_sections def remove_markdown_tags(text: str, tag_patterns: dict) -> str: @@ -120,16 +127,17 @@ async def process_pre_embedding_cleaner(record: dict) -> dict: record["data"]["chunk"]["content"] ) cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"] - cleaned_record["data"]["section"] = record["data"]["chunk"]["section"] - cleaned_record["data"]["page_number"] = record["data"]["chunk"][ - "page_number" - ] + cleaned_record["data"]["sections"] = clean_sections( + record["data"]["chunk"]["sections"] + ) else: cleaned_record["data"]["cleaned_chunk"] = clean_text( record["data"]["chunk"] ) cleaned_record["data"]["chunk"] = record["data"]["chunk"] - cleaned_record["data"]["section"] = get_section(record["data"]["chunk"]) + cleaned_record["data"]["cleaned_sections"] = get_section( + record["data"]["chunk"] + ) except Exception as e: logging.error("string cleanup Error: %s", e) diff --git a/ai_search_with_adi/ai_search/ai_search.py b/ai_search_with_adi/ai_search/ai_search.py index 4f4edb8..222ee66 100644 --- a/ai_search_with_adi/ai_search/ai_search.py +++ 
b/ai_search_with_adi/ai_search/ai_search.py @@ -175,7 +175,7 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection: return data_source_connection def get_pre_embedding_cleaner_skill( - self, context, source, chunk_by_page=False, target_name="cleaned_chunk" + self, context, source, target_name="cleaned_chunk" ) -> WebApiSkill: """Get the custom skill for data cleanup. @@ -203,18 +203,11 @@ def get_pre_embedding_cleaner_skill( pre_embedding_cleaner_skill_outputs = [ OutputFieldMappingEntry(name="cleaned_chunk", target_name=target_name), OutputFieldMappingEntry(name="chunk", target_name="chunk"), - OutputFieldMappingEntry(name="section", target_name="section"), + OutputFieldMappingEntry( + name="cleaned_sections", target_name="cleaned_sections" + ), ] - if chunk_by_page: - pre_embedding_cleaner_skill_outputs.extend( - [ - OutputFieldMappingEntry( - name="page_number", target_name="page_number" - ), - ] - ) - pre_embedding_cleaner_skill = WebApiSkill( name="Pre Embedding Cleaner Skill", description="Skill to clean the data before sending to embedding", @@ -277,8 +270,9 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill: batch_size = 1 degree_of_parallelism = 4 else: + # Depending on your GPT Token limit, you may need to adjust the batch size and degree of parallelism batch_size = 1 - degree_of_parallelism = 16 + degree_of_parallelism = 8 if chunk_by_page: output = [ diff --git a/ai_search_with_adi/ai_search/rag_documents.py b/ai_search_with_adi/ai_search/rag_documents.py index a0feffc..1dadc06 100644 --- a/ai_search_with_adi/ai_search/rag_documents.py +++ b/ai_search_with_adi/ai_search/rag_documents.py @@ -191,7 +191,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjections: name="Keywords", source="/document/pages/*/keywords" ), InputFieldMappingEntry( - name="Sections", source="/document/pages/*/sections" + name="Sections", source="/document/pages/*/cleaned_sections" ), ] From 4ecf2c8b717e0f3b9ed5cdaee5284e930276d43c Mon Sep 17 
00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 19:46:42 +0100 Subject: [PATCH 26/33] More bug fixes --- ai_search_with_adi/adi_function_app/adi_2_ai_search.py | 6 +++--- .../adi_function_app/pre_embedding_cleaner.py | 2 +- ai_search_with_adi/ai_search/rag_documents.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py index da54d3f..b7609af 100644 --- a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py +++ b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py @@ -66,7 +66,7 @@ def clean_adi_markdown( """ output_dict = {} - comment_patterns = r"|||" + comment_patterns = r"|||" cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL) combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n" @@ -82,8 +82,6 @@ def clean_adi_markdown( irrelevant_figure_pattern, "", cleaned_text, flags=re.DOTALL ) - # Replace ':selected:' with a new line - cleaned_text = re.sub(r":(selected|unselected):", "\n", cleaned_text) output_dict["content"] = cleaned_text output_dict["sections"] = doc_metadata @@ -313,6 +311,8 @@ async def process_figures_from_extracted_content( image_descriptions = await asyncio.gather(*image_understanding_tasks) + logging.info(f"Image Descriptions: {image_descriptions}") + for idx, img_description in enumerate(image_descriptions): markdown_content = update_figure_description( markdown_content, img_description, idx diff --git a/ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py b/ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py index 5c81d7a..005954e 100644 --- a/ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py +++ b/ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py @@ -127,7 +127,7 @@ async def process_pre_embedding_cleaner(record: dict) -> dict: record["data"]["chunk"]["content"] ) cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"] - 
cleaned_record["data"]["sections"] = clean_sections( + cleaned_record["data"]["cleaned_sections"] = clean_sections( record["data"]["chunk"]["sections"] ) else: diff --git a/ai_search_with_adi/ai_search/rag_documents.py b/ai_search_with_adi/ai_search/rag_documents.py index 1dadc06..2e1121a 100644 --- a/ai_search_with_adi/ai_search/rag_documents.py +++ b/ai_search_with_adi/ai_search/rag_documents.py @@ -148,7 +148,7 @@ def get_skills(self) -> list: ) pre_embedding_cleaner_skill = self.get_pre_embedding_cleaner_skill( - "/document/pages/*", "/document/pages/*", self.enable_page_by_chunking + "/document/pages/*", "/document/pages/*" ) key_phrase_extraction_skill = self.get_key_phrase_extraction_skill( @@ -248,7 +248,7 @@ def get_indexer(self) -> SearchIndexer: fail_on_unprocessable_document=False, fail_on_unsupported_content_type=False, index_storage_metadata_only_for_oversized_documents=True, - indexed_file_name_extensions=".pdf,.pptx,.docx,.xlsx,.txt", + indexed_file_name_extensions=".pdf,.pptx,.docx,.xlsx,.txt,.png,.jpg,.jpeg", ), max_failed_items=5, ) From 3928edb65850f17fa4205b761218be0b880705d2 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 20:09:27 +0100 Subject: [PATCH 27/33] Handle new way of finding figures --- .../adi_function_app/adi_2_ai_search.py | 60 +++++++++++-------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py index b7609af..7872470 100644 --- a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py +++ b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py @@ -92,7 +92,7 @@ def clean_adi_markdown( return output_dict -def update_figure_description(md_content, img_description, idx): +def update_figure_description(md_content, img_description, offset, length): """ Updates the figure description in the Markdown content. 
@@ -105,26 +105,20 @@ def update_figure_description(md_content, img_description, idx): str: The updated Markdown content with the new figure description. """ - # The substring you're looking for - start_substring = f"![](figures/{idx})" - end_substring = "" + # Define the new string to replace the old content new_string = f'' - new_md_content = md_content - # Find the start and end indices of the part to replace - start_index = md_content.find(start_substring) - if start_index != -1: # if start_substring is found - start_index += len( - start_substring - ) # move the index to the end of start_substring - end_index = md_content.find(end_substring, start_index) - if end_index != -1: # if end_substring is found - # Replace the old string with the new string - new_md_content = ( - md_content[:start_index] + new_string + md_content[end_index:] - ) + # Calculate the end index of the content to be replaced + end_index = offset + length + + # Ensure that the end_index does not exceed the length of the Markdown content + if end_index > len(md_content): + end_index = len(md_content) + + # Replace the old string with the new string + new_md_content = md_content[:offset] + new_string + md_content[end_index:] - return new_md_content + return new_md_content, len(new_string) async def understand_image_with_gptv(image_base64, caption, tries_left=3): @@ -260,7 +254,11 @@ async def mark_image_as_irrelevant(): async def process_figures_from_extracted_content( - file_path: str, markdown_content: str, figures: list, page_number: None | int = None + file_path: str, + markdown_content: str, + figures: list, + page_number: None | int = None, + page_offset: int = 0, ) -> str: """Process the figures extracted from the content using ADI and send them for analysis. @@ -270,6 +268,7 @@ async def process_figures_from_extracted_content( markdown_content (str): The extracted content in Markdown format. figures (list): The list of figures extracted by the Azure Document Intelligence service. 
page_number (int): The page number to process. If None, all pages are processed. + page_offset (int): The offset of the page. Returns: -------- @@ -313,10 +312,14 @@ async def process_figures_from_extracted_content( logging.info(f"Image Descriptions: {image_descriptions}") - for idx, img_description in enumerate(image_descriptions): - markdown_content = update_figure_description( - markdown_content, img_description, idx + running_offset = 0 + for idx, figure in enumerate(figures): + img_description = image_descriptions[idx] + starting_offset = figure.spans[0].offset + running_offset - page_offset + markdown_content, desc_offset = update_figure_description( + markdown_content, img_description, starting_offset, figure.spans[0].length ) + running_offset += desc_offset return markdown_content @@ -335,6 +338,7 @@ def create_page_wise_content(result: AnalyzeResult) -> list: page_wise_content = [] page_numbers = [] + page_offsets = [] for page_number, page in enumerate(result.pages): page_content = result.content[ @@ -342,6 +346,7 @@ def create_page_wise_content(result: AnalyzeResult) -> list: ] page_wise_content.append(page_content) page_numbers.append(page_number) + page_offsets.append(page.spans[0]["offset"]) return page_wise_content, page_numbers @@ -496,15 +501,20 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> try: if chunk_by_page: cleaned_result = [] - markdown_content, page_numbers = create_page_wise_content(result) + markdown_content, page_numbers, page_offsets = create_page_wise_content( + result + ) content_with_figures_tasks = [ process_figures_from_extracted_content( temp_file_path, page_content, result.figures, page_number=page_number, + page_offset=page_offset, + ) + for page_content, page_number, page_offset in zip( + markdown_content, page_numbers, page_offsets ) - for page_content, page_number in zip(markdown_content, page_numbers) ] content_with_figures = await asyncio.gather(*content_with_figures_tasks) @@ -523,7 
+533,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> else: markdown_content = result.content content_with_figures = await process_figures_from_extracted_content( - temp_file_path, markdown_content, result.figures + temp_file_path, markdown_content, result.figures, page_offset=0 ) cleaned_result = clean_adi_markdown( content_with_figures, remove_irrelevant_figures=False From 6d7ca8645fc90cb0f76d17c508e79f0091d919c7 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 20:12:00 +0100 Subject: [PATCH 28/33] return offsets --- ai_search_with_adi/adi_function_app/adi_2_ai_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py index 7872470..aa2687b 100644 --- a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py +++ b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py @@ -348,7 +348,7 @@ def create_page_wise_content(result: AnalyzeResult) -> list: page_numbers.append(page_number) page_offsets.append(page.spans[0]["offset"]) - return page_wise_content, page_numbers + return page_wise_content, page_numbers, page_offsets async def analyse_document(file_path: str) -> AnalyzeResult: From 41a42379f6db9033d52d39262a867855cdac3cd8 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 20:47:49 +0100 Subject: [PATCH 29/33] Update code --- .../adi_function_app/adi_2_ai_search.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py index aa2687b..d54468b 100644 --- a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py +++ b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py @@ -274,6 +274,7 @@ async def process_figures_from_extracted_content( -------- str: The updated Markdown content with the figure descriptions.""" + figure_spans = [] 
image_understanding_tasks = [] for idx, figure in enumerate(figures): img_description = "" @@ -297,6 +298,8 @@ async def process_figures_from_extracted_content( file_path, region.page_number - 1, bounding_box ) # page_number is 1-indexed3 + figure_spans.append(figure.spans[0]) + if cropped_image is None: image_understanding_tasks.append(mark_image_as_irrelevant()) else: @@ -313,11 +316,10 @@ async def process_figures_from_extracted_content( logging.info(f"Image Descriptions: {image_descriptions}") running_offset = 0 - for idx, figure in enumerate(figures): - img_description = image_descriptions[idx] - starting_offset = figure.spans[0].offset + running_offset - page_offset + for figure_span, image_description in zip(figure_spans, image_descriptions): + starting_offset = figure_span.offset + running_offset - page_offset markdown_content, desc_offset = update_figure_description( - markdown_content, img_description, starting_offset, figure.spans[0].length + markdown_content, image_description, starting_offset, figure_span.length ) running_offset += desc_offset @@ -521,7 +523,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> with concurrent.futures.ProcessPoolExecutor() as executor: futures = { executor.submit( - clean_adi_markdown, page_content, page_number, False + clean_adi_markdown, page_content, page_number, True ): page_content for page_content, page_number in zip( content_with_figures, page_numbers @@ -536,7 +538,7 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> temp_file_path, markdown_content, result.figures, page_offset=0 ) cleaned_result = clean_adi_markdown( - content_with_figures, remove_irrelevant_figures=False + content_with_figures, remove_irrelevant_figures=True ) except Exception as e: logging.error(e) From b8f7984580713cb2178aa2255ff91ac97a1cea1a Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Tue, 10 Sep 2024 21:26:59 +0100 Subject: [PATCH 30/33] Fix figure detection --- 
.../adi_function_app/adi_2_ai_search.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py index d54468b..8597550 100644 --- a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py +++ b/ai_search_with_adi/adi_function_app/adi_2_ai_search.py @@ -75,9 +75,7 @@ def clean_adi_markdown( if remove_irrelevant_figures: # Remove irrelevant figures - irrelevant_figure_pattern = ( - r"
.*?.*?
\s*" - ) + irrelevant_figure_pattern = r"\s*" cleaned_text = re.sub( irrelevant_figure_pattern, "", cleaned_text, flags=re.DOTALL ) @@ -296,7 +294,7 @@ async def process_figures_from_extracted_content( ) cropped_image = crop_image_from_pdf_page( file_path, region.page_number - 1, bounding_box - ) # page_number is 1-indexed3 + ) # page_number is 1-indexed figure_spans.append(figure.spans[0]) @@ -347,7 +345,7 @@ def create_page_wise_content(result: AnalyzeResult) -> list: page.spans[0]["offset"] : page.spans[0]["offset"] + page.spans[0]["length"] ] page_wise_content.append(page_content) - page_numbers.append(page_number) + page_numbers.append(page_number + 1) page_offsets.append(page.spans[0]["offset"]) return page_wise_content, page_numbers, page_offsets @@ -535,7 +533,11 @@ async def process_adi_2_ai_search(record: dict, chunk_by_page: bool = False) -> else: markdown_content = result.content content_with_figures = await process_figures_from_extracted_content( - temp_file_path, markdown_content, result.figures, page_offset=0 + temp_file_path, + markdown_content, + result.figures, + page_offset=0, + page_number=1, ) cleaned_result = clean_adi_markdown( content_with_figures, remove_irrelevant_figures=True From 0e6c58ff3a680bfdf7f9f24f865c9c88a477adc8 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Wed, 11 Sep 2024 00:43:39 +0100 Subject: [PATCH 31/33] Refactor location --- README.md | 1 + .../.funcignore | 0 adi_function_app/README.md | 206 ++++++++++++++++++ .../adi_2_ai_search.py | 0 .../environment.py | 0 .../function_app.py | 0 .../host.json | 0 .../images/Indexing vs Indexing with ADI.png | Bin .../key_phrase_extraction.py | 0 .../pre_embedding_cleaner.py | 0 .../requirements.txt | 0 .../storage_account.py | 0 .../ai_search => ai_search}/README.md | 0 .../ai_search => ai_search}/ai_search.py | 0 .../ai_search => ai_search}/deploy.py | 0 .../ai_search => ai_search}/environment.py | 0 .../ai_search => ai_search}/rag_documents.py | 0 .../ai_search => 
ai_search}/requirements.txt | 0 ai_search_with_adi/README.md | 68 ------ ai_search_with_adi/adi_function_app/README.md | 144 ------------ ai_search_with_adi/ai_search/.env | 20 -- 21 files changed, 207 insertions(+), 232 deletions(-) rename {ai_search_with_adi/adi_function_app => adi_function_app}/.funcignore (100%) create mode 100644 adi_function_app/README.md rename {ai_search_with_adi/adi_function_app => adi_function_app}/adi_2_ai_search.py (100%) rename {ai_search_with_adi/adi_function_app => adi_function_app}/environment.py (100%) rename {ai_search_with_adi/adi_function_app => adi_function_app}/function_app.py (100%) rename {ai_search_with_adi/adi_function_app => adi_function_app}/host.json (100%) rename {ai_search_with_adi => adi_function_app}/images/Indexing vs Indexing with ADI.png (100%) rename {ai_search_with_adi/adi_function_app => adi_function_app}/key_phrase_extraction.py (100%) rename {ai_search_with_adi/adi_function_app => adi_function_app}/pre_embedding_cleaner.py (100%) rename {ai_search_with_adi/adi_function_app => adi_function_app}/requirements.txt (100%) rename {ai_search_with_adi/adi_function_app => adi_function_app}/storage_account.py (100%) rename {ai_search_with_adi/ai_search => ai_search}/README.md (100%) rename {ai_search_with_adi/ai_search => ai_search}/ai_search.py (100%) rename {ai_search_with_adi/ai_search => ai_search}/deploy.py (100%) rename {ai_search_with_adi/ai_search => ai_search}/environment.py (100%) rename {ai_search_with_adi/ai_search => ai_search}/rag_documents.py (100%) rename {ai_search_with_adi/ai_search => ai_search}/requirements.txt (100%) delete mode 100644 ai_search_with_adi/README.md delete mode 100644 ai_search_with_adi/adi_function_app/README.md delete mode 100644 ai_search_with_adi/ai_search/.env diff --git a/README.md b/README.md index 3fbf3e6..f639bae 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ It is intended that the plugins and skills provided in this repository, are adap - `./text2sql` contains an 
Multi-Shot implementation for Text2SQL generation and querying which can be used to answer questions backed by a database as a knowledge base. - `./ai_search_with_adi` contains code for linking Azure Document Intelligence with AI Search to process complex documents with charts and images, and uses multi-modal models (gpt4o) to interpret and understand these. +- `./ai_search` provides an easy Python based utility for deploying an index, indexer and corresponding skillset for AI Search. The above components have been successfully used on production RAG projects to increase the quality of responses. The code provided in this repo is a sample of the implementation and should be adjusted before being used in production. diff --git a/ai_search_with_adi/adi_function_app/.funcignore b/adi_function_app/.funcignore similarity index 100% rename from ai_search_with_adi/adi_function_app/.funcignore rename to adi_function_app/.funcignore diff --git a/adi_function_app/README.md b/adi_function_app/README.md new file mode 100644 index 0000000..5c3dd3c --- /dev/null +++ b/adi_function_app/README.md @@ -0,0 +1,206 @@ +# AI Search Indexing with Azure Document Intelligence + +This portion of the repo contains code for linking Azure Document Intelligence with AI Search to process complex documents with charts and images, and uses multi-modal models (gpt4o) to interpret and understand these. + +The implementation is in Python, although it can easily be adapted for C# or another language. The code is designed to run in an Azure Function App inside the tenant. + +**This approach makes use of Azure Document Intelligence v4.0 which is still in preview.** + +## High Level Workflow + +A common way to perform document indexing is to either extract the text content or use [optical character recognition](https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-ocr) to gather the text content before indexing.
Whilst this works well for simple files that contain mainly text based information, the response quality diminishes significantly when the documents contain mainly charts and images, such as a PowerPoint presentation. + +To solve this issue and to ensure that good quality information is extracted from the document, an indexer using [Azure Document Intelligence (ADI)](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/overview?view=doc-intel-4.0.0) is developed with [Custom Skills](https://learn.microsoft.com/en-us/azure/search/cognitive-search-custom-skill-web-api): + +![High level workflow for indexing with Azure Document Intelligence based skills](./images/Indexing%20vs%20Indexing%20with%20ADI.png "Indexing with Azure Document Intelligence Approach") + +Instead of using OCR to extract the contents of the document, ADIv4 is used to analyse the layout of the document and convert it to a Markdown format. The Markdown format brings benefits such as: + +- Table layout +- Section and header extraction with Markdown headings +- Figure and image extraction + +Once the Markdown is obtained, several steps are carried out: + +1. **Extraction of images / charts**. The figures identified are extracted from the original document and passed to a multi-modal model (gpt4o in this case) for analysis. We obtain a description and summary of the chart / image to infer the meaning of the figure. This allows us to index and perform RAG analysis on the information that is visually obtainable from a chart, without it being explicitly mentioned in the surrounding text. The information is added back into the original chart. + +2. **Extraction of sections and headers**. The sections and headers are extracted from the document and returned additionally to the indexer under a separate field. This allows us to store them as a separate field in the index and therefore surface the most relevant chunks. + +3. **Cleaning of Markdown**.
The final markdown content is cleaned of any characters or unsupported Markdown elements that we do not want in the chunk e.g. non-relevant images. + +Page wise analysis in ADI is used to avoid splitting tables / figures across multiple chunks, when the chunking is performed. + +The properties returned from the ADI Custom Skill are then used to perform the following skills: + +- Pre-vectorisation cleaning +- Keyphrase extraction +- Vectorisation + +## Provided Notebooks \& Utilities + +- `./adi_function_app` provides a pre-built Python function app that communicates with Azure Document Intelligence, Azure OpenAI etc to perform the Markdown conversion, extraction of figures, figure understanding and corresponding cleaning of Markdown. +- `./rag_with_ai_search.ipynb` provides an example of how to utilise the AI Search plugin to query the index. + +## Deploying AI Search Setup + +To deploy the pre-built index and associated indexer / skillset setup, see instructions in `./ai_search/README.md`. + +## ADI Custom Skill + +Deploy the associated function app and required resources. You can then experiment with the custom skill by sending an HTTP request in the AI Search JSON format to the `/adi_2_ai_search` HTTP endpoint. + +To use with an index, either use the utility to configure an indexer in the provided form, or integrate the skill with your skillset pipeline. + +### Deployment Steps + +1. Update `.env` file with the associated values. Not all values are required, depending on whether you are using System / User Assigned Identities or a Key based authentication. Use this template to update the environment variables in the function app. +2. Make sure the infra and required identities are set up. This setup requires Azure Document Intelligence and GPT4o. +3. [Deploy your function app](https://learn.microsoft.com/en-us/azure/azure-functions/functions-deployment-technologies?tabs=windows) and test with an HTTP request.
+ +### Code Files + +#### function_app.py + +`./indexer/adi_function_app.py` contains the HTTP entrypoints for the ADI skill and the other provided utility skills. + +#### adi_2_aisearch + +`./indexer/adi_2_aisearch.py` contains the methods for content extraction with ADI. The key methods are: + +##### analyse_document + +This method takes the passed file, uploads it to ADI and retrieves the Markdown format. + +##### process_figures_from_extracted_content + +This method takes the detected figures, and crops them out of the page to save them as images. It uses the `understand_image_with_vlm` to communicate with Azure OpenAI to understand the meaning of the extracted figure. + +`update_figure_description` is used to update the original Markdown content with the description and meaning of the figure. + +##### clean_adi_markdown + +This method performs the final cleaning of the Markdown contents. In this method, the section headings and page numbers are extracted for the content to be returned to the indexer. + +### Input Format + +The ADI Skill conforms to the [Azure AI Search Custom Skill Input Format](https://learn.microsoft.com/en-gb/azure/search/cognitive-search-custom-skill-web-api?WT.mc_id=Portal-Microsoft_Azure_Search#sample-input-json-structure). AI Search will automatically build this format if you use the utility file provided in this repo to build your indexer and skillset. + +```json +{ + "values": [ + { + "recordId": "0", + "data": { + "source": "" + } + }, + { + "recordId": "1", + "data": { + "source": "" + } + } + ] +} +``` + +### Output Format + +The ADI Skill conforms to the [Azure AI Search Custom Skill Output Format](https://learn.microsoft.com/en-gb/azure/search/cognitive-search-custom-skill-web-api?WT.mc_id=Portal-Microsoft_Azure_Search#sample-output-json-structure). 
+ +If `chunk_by_page` header is `True` (recommended): + +```json +{ + "values": [ + { + "recordId": "0", + "data": { + "extracted_content": [ + { + "page_number": 1, + "sections": [ + "" + ], + "content": "" + }, + { + "page_number": 2, + "sections": [ + "" + ], + "content": "" + } + ] + } + }, + { + "recordId": "1", + "data": { + "extracted_content": [ + { + "page_number": 1, + "sections": [ + "" + ], + "content": "" + }, + { + "page_number": 2, + "sections": [ + "" + ], + "content": "" + } + ] + } + } + ] +} +``` + +If `chunk_by_page` header is `False`: + +```json +{ + "values": [ + { + "recordId": "0", + "data": { + "extracted_content": { + "sections": [ + "" + ], + "content": "" + } + } + }, + { + "recordId": "1", + "data": { + "extracted_content": { + "sections": [ + "" + ], + "content": "" + } + } + } + ] +} +``` + +**Page wise analysis in ADI is recommended to avoid splitting tables / figures across multiple chunks, when the chunking is performed.** + +## Production Considerations + +Below are some of the considerations that should be made before using this custom skill in production: + +- This approach makes use of Azure Document Intelligence v4.0 which is still in preview. Features may change before the GA release. ADI v4.0 preview is only available in select regions. +- Azure Document Intelligence output quality varies significantly by file type. A PDF file type will produce richer outputs in terms of figure detection etc, compared to a PPTX file in our testing. + +## Possible Improvements + +Below are some possible improvements that could be made to the vectorisation approach: + +- Storing the extracted figures in blob storage for access later. This would allow the LLM to resurface the correct figure or provide a link to the figure in the reference system to be displayed in the UI.
diff --git a/ai_search_with_adi/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py similarity index 100% rename from ai_search_with_adi/adi_function_app/adi_2_ai_search.py rename to adi_function_app/adi_2_ai_search.py diff --git a/ai_search_with_adi/adi_function_app/environment.py b/adi_function_app/environment.py similarity index 100% rename from ai_search_with_adi/adi_function_app/environment.py rename to adi_function_app/environment.py diff --git a/ai_search_with_adi/adi_function_app/function_app.py b/adi_function_app/function_app.py similarity index 100% rename from ai_search_with_adi/adi_function_app/function_app.py rename to adi_function_app/function_app.py diff --git a/ai_search_with_adi/adi_function_app/host.json b/adi_function_app/host.json similarity index 100% rename from ai_search_with_adi/adi_function_app/host.json rename to adi_function_app/host.json diff --git a/ai_search_with_adi/images/Indexing vs Indexing with ADI.png b/adi_function_app/images/Indexing vs Indexing with ADI.png similarity index 100% rename from ai_search_with_adi/images/Indexing vs Indexing with ADI.png rename to adi_function_app/images/Indexing vs Indexing with ADI.png diff --git a/ai_search_with_adi/adi_function_app/key_phrase_extraction.py b/adi_function_app/key_phrase_extraction.py similarity index 100% rename from ai_search_with_adi/adi_function_app/key_phrase_extraction.py rename to adi_function_app/key_phrase_extraction.py diff --git a/ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py b/adi_function_app/pre_embedding_cleaner.py similarity index 100% rename from ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py rename to adi_function_app/pre_embedding_cleaner.py diff --git a/ai_search_with_adi/adi_function_app/requirements.txt b/adi_function_app/requirements.txt similarity index 100% rename from ai_search_with_adi/adi_function_app/requirements.txt rename to adi_function_app/requirements.txt diff --git 
a/ai_search_with_adi/adi_function_app/storage_account.py b/adi_function_app/storage_account.py similarity index 100% rename from ai_search_with_adi/adi_function_app/storage_account.py rename to adi_function_app/storage_account.py diff --git a/ai_search_with_adi/ai_search/README.md b/ai_search/README.md similarity index 100% rename from ai_search_with_adi/ai_search/README.md rename to ai_search/README.md diff --git a/ai_search_with_adi/ai_search/ai_search.py b/ai_search/ai_search.py similarity index 100% rename from ai_search_with_adi/ai_search/ai_search.py rename to ai_search/ai_search.py diff --git a/ai_search_with_adi/ai_search/deploy.py b/ai_search/deploy.py similarity index 100% rename from ai_search_with_adi/ai_search/deploy.py rename to ai_search/deploy.py diff --git a/ai_search_with_adi/ai_search/environment.py b/ai_search/environment.py similarity index 100% rename from ai_search_with_adi/ai_search/environment.py rename to ai_search/environment.py diff --git a/ai_search_with_adi/ai_search/rag_documents.py b/ai_search/rag_documents.py similarity index 100% rename from ai_search_with_adi/ai_search/rag_documents.py rename to ai_search/rag_documents.py diff --git a/ai_search_with_adi/ai_search/requirements.txt b/ai_search/requirements.txt similarity index 100% rename from ai_search_with_adi/ai_search/requirements.txt rename to ai_search/requirements.txt diff --git a/ai_search_with_adi/README.md b/ai_search_with_adi/README.md deleted file mode 100644 index e1e2b47..0000000 --- a/ai_search_with_adi/README.md +++ /dev/null @@ -1,68 +0,0 @@ -# AI Search Indexing with Azure Document Intelligence - -This portion of the repo contains code for linking Azure Document Intelligence with AI Search to process complex documents with charts and images, and uses multi-modal models (gpt4o) to interpret and understand these. - -The implementation in Python, although it can easily be adapted for C# or another language. 
The code is designed to run in an Azure Function App inside the tenant. - -**This approach makes use of Azure Document Intelligence v4.0 which is still in preview.** - -## High Level Workflow - -A common way to perform document indexing, is to either extract the text content or use [optical character recognition](https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-ocr) to gather the text content before indexing. Whilst this works well for simple files that contain mainly text based information, the response quality diminishes significantly when the documents contain mainly charts and images, such as a PowerPoint presentation. - -To solve this issue and to ensure that good quality information is extracted from the document, an indexer using [Azure Document Intelligence (ADI)](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/overview?view=doc-intel-4.0.0) is developed with [Custom Skills](https://learn.microsoft.com/en-us/azure/search/cognitive-search-custom-skill-web-api): - -![High level workflow for indexing with Azure Document Intelligence based skills](./images/Indexing%20vs%20Indexing%20with%20ADI.png "Indexing with Azure Document Intelligence Approach") - -Instead of using OCR to extract the contents of the document, ADIv4 is used to analyse the layout of the document and convert it to a Markdown format. The Markdown format brings benefits such as: - -- Table layout -- Section and header extraction with Markdown headings -- Figure and image extraction - -Once the Markdown is obtained, several steps are carried out: - -1. **Extraction of images / charts**. The figures identified are extracted from the original document and passed to a multi-modal model (gpt4o in this case) for analysis. We obtain a description and summary of the chart / image to infer the meaning of the figure. 
This allows us to index and perform RAG analysis the information that is visually obtainable from a chart, without it being explicitly mentioned in the text surrounding. The information is added back into the original chart. - -2. **Extraction of sections and headers**. The sections and headers are extracted from the document and returned additionally to the indexer under a separate field. This allows us to store them as a separate field in the index and therefore surface the most relevant chunks. - -3. **Cleaning of Markdown**. The final markdown content is cleaned of any characters or unsupported Markdown elements that we do not want in the chunk e.g. non-relevant images. - -Page wise analysis in ADI is used to avoid splitting tables / figures across multiple chunks, when the chunking is performed. - -The properties returned from the ADI Custom Skill are then used to perform the following skills: - -- Pre-vectorisation cleaning -- Keyphrase extraction -- Vectorisation - -## Provided Notebooks \& Utilities - -- `./ai_search.py`, `./deploy.py` provide an easy Python based utility for deploying an index, indexer and corresponding skillset for AI Search. -- `./adi_function_apps/indexer` provides a pre-built Python function app that communicates with Azure Document Intelligence, Azure OpenAI etc to perform the Markdown conversion, extraction of figures, figure understanding and corresponding cleaning of Markdown. -- `./rag_with_ai_search.ipynb` provides example of how to utilise the AI Search plugin to query the index. - -## Deploying AI Search Setup - -To deploy the pre-built index and associated indexer / skillset setup, see instructions in `./ai_search/README.md`. - -## ADI Custom Skill - -Deploy the associated function app and required resources. You can then experiment with the custom skill by sending an HTTP request in the AI Search JSON format to the `/adi_2_ai_search` HTTP endpoint. 
- -To use with an index, either use the utility to configure a indexer in the provided form, or integrate the skill with your skillset pipeline. - -Steps for deployment of the function app can be found in `./adi_function_app/README.md`. - -## Production Considerations - -Below are some of the considerations that should be made before using this custom skill in production: - -- This approach makes use of Azure Document Intelligence v4.0 which is still in preview. Features may change before the GA release. ADI v4.0 preview is only available in select regions. -- Azure Document Intelligence output quality varies significantly by file type. A PDF file type will producer richer outputs in terms of figure detection etc, compared to a PPTX file in our testing. - -## Possible Improvements - -Below are some possible improvements that could be made to the vectorisation approach: - -- Storing the extracted figures in blob storage for access later. This would allow the LLM to resurface the correct figure or provide a link to the give in the reference system to be displayed in the UI. diff --git a/ai_search_with_adi/adi_function_app/README.md b/ai_search_with_adi/adi_function_app/README.md deleted file mode 100644 index b718fd3..0000000 --- a/ai_search_with_adi/adi_function_app/README.md +++ /dev/null @@ -1,144 +0,0 @@ -# AI Search Indexing with Azure Document Intelligence - Function App Setup - -The associated scripts in this portion of the repository contains the Azure Document Intelligence powered Function app. - -## Steps - -1. Update `.env` file with the associated values. Not all values are required dependent on whether you are using System / User Assigned Identities or a Key based authentication. Use this template to update the environment variables in the function app. -2. Make sure the infra and required identities are setup. This setup requires Azure Document Intelligence and GPT4o. -3. 
[Deploy your function app](https://learn.microsoft.com/en-us/azure/azure-functions/functions-deployment-technologies?tabs=windows) and test with a HTTP request. - -## Code Files - -### function_app.py - -`./indexer/adi_function_app.py` contains the HTTP entrypoints for the ADI skill and the other provided utility skills. - -### adi_2_aisearch - -`./indexer/adi_2_aisearch.py` contains the methods for content extraction with ADI. The key methods are: - -#### analyse_document - -This method takes the passed file, uploads it to ADI and retrieves the Markdown format. - -#### process_figures_from_extracted_content - -This method takes the detected figures, and crops them out of the page to save them as images. It uses the `understand_image_with_vlm` to communicate with Azure OpenAI to understand the meaning of the extracted figure. - -`update_figure_description` is used to update the original Markdown content with the description and meaning of the figure. - -#### clean_adi_markdown - -This method performs the final cleaning of the Markdown contents. In this method, the section headings and page numbers are extracted for the content to be returned to the indexer. - -## Input Format - -The ADI Skill conforms to the [Azure AI Search Custom Skill Input Format](https://learn.microsoft.com/en-gb/azure/search/cognitive-search-custom-skill-web-api?WT.mc_id=Portal-Microsoft_Azure_Search#sample-input-json-structure). AI Search will automatically build this format if you use the utility file provided in this repo to build your indexer and skillset. - -```json -{ - "values": [ - { - "recordId": "0", - "data": { - "source": "" - } - }, - { - "recordId": "1", - "data": { - "source": "" - } - } - ] -} -``` - -## Output Format - -The ADI Skill conforms to the [Azure AI Search Custom Skill Output Format](https://learn.microsoft.com/en-gb/azure/search/cognitive-search-custom-skill-web-api?WT.mc_id=Portal-Microsoft_Azure_Search#sample-output-json-structure). 
- -If `chunk_by_page` header is `True` (recommended): - -```json -{ - "values": [ - { - "recordId": "0", - "data": { - "extracted_content": [ - { - "page_number": 1, - "sections": [ - "" - ], - "content": "" - }, - { - "page_number": 2, - "sections": [ - "" - ], - "content": "" - } - ] - } - }, - { - "recordId": "1", - "data": { - "extracted_content": [ - { - "page_number": 1, - "sections": [ - "" - ], - "content": "" - }, - { - "page_number": 2, - "sections": [ - "" - ], - "content": "" - } - ] - } - } - ] -} -``` - -If `chunk_by_page` header is `False`: - -```json -{ - "values": [ - { - "recordId": "0", - "data": { - "extracted_content": { - "sections": [ - "" - ], - "content": "" - } - } - }, - { - "recordId": "1", - "data": { - "extracted_content": { - "sections": [ - "" - ], - "content": "" - } - } - } - ] -} -``` - -**Page wise analysis in ADI is recommended to avoid splitting tables / figures across multiple chunks, when the chunking is performed.** diff --git a/ai_search_with_adi/ai_search/.env b/ai_search_with_adi/ai_search/.env deleted file mode 100644 index faeeed4..0000000 --- a/ai_search_with_adi/ai_search/.env +++ /dev/null @@ -1,20 +0,0 @@ -FunctionApp__Endpoint= -FunctionApp__Key= -FunctionApp__PreEmbeddingCleaner__FunctionName=pre_embedding_cleaner -FunctionApp__ADI__FunctionName=adi_2_ai_search -FunctionApp__KeyPhraseExtractor__FunctionName=key_phrase_extractor -FunctionApp__AppRegistrationResourceId= -IdentityType= # system_assigned or user_assigned or key -AIService__AzureSearchOptions__Endpoint= -AIService__AzureSearchOptions__Identity__ClientId= -AIService__AzureSearchOptions__Key= -AIService__AzureSearchOptions__UsePrivateEndpoint= -AIService__AzureSearchOptions__Identity__FQName= -StorageAccount__FQEndpoint= -StorageAccount__ConnectionString= -StorageAccount__RagDocuments__Container= -OpenAI__ApiKey= -OpenAI__Endpoint= -OpenAI__EmbeddingModel= -OpenAI__EmbeddingDeployment= -OpenAI__EmbeddingDimensions=1536 From 
31914190b59b06a85cc2d5e4082610eeeefa0427 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Wed, 11 Sep 2024 00:45:49 +0100 Subject: [PATCH 32/33] Further refactor --- .vscode/tasks.json | 2 +- README.md | 4 ++-- adi_function_app/README.md | 6 +++--- {ai_search => deploy_ai_search}/README.md | 0 {ai_search => deploy_ai_search}/ai_search.py | 0 {ai_search => deploy_ai_search}/deploy.py | 0 {ai_search => deploy_ai_search}/environment.py | 0 {ai_search => deploy_ai_search}/rag_documents.py | 0 {ai_search => deploy_ai_search}/requirements.txt | 0 9 files changed, 6 insertions(+), 6 deletions(-) rename {ai_search => deploy_ai_search}/README.md (100%) rename {ai_search => deploy_ai_search}/ai_search.py (100%) rename {ai_search => deploy_ai_search}/deploy.py (100%) rename {ai_search => deploy_ai_search}/environment.py (100%) rename {ai_search => deploy_ai_search}/rag_documents.py (100%) rename {ai_search => deploy_ai_search}/requirements.txt (100%) diff --git a/.vscode/tasks.json b/.vscode/tasks.json index b4388cb..359b710 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -5,7 +5,7 @@ "isBackground": true, "label": "func: host start", "options": { - "cwd": "${workspaceFolder}/ai_search_with_adi/adi_function_app" + "cwd": "${workspaceFolder}/adi_function_app" }, "problemMatcher": "$func-python-watch", "type": "func" diff --git a/README.md b/README.md index f639bae..2b649a2 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ It is intended that the plugins and skills provided in this repository, are adap ## Components - `./text2sql` contains an Multi-Shot implementation for Text2SQL generation and querying which can be used to answer questions backed by a database as a knowledge base. -- `./ai_search_with_adi` contains code for linking Azure Document Intelligence with AI Search to process complex documents with charts and images, and uses multi-modal models (gpt4o) to interpret and understand these.
-- `./ai_search` provides an easy Python based utility for deploying an index, indexer and corresponding skillset for AI Search. +- `./adi_function_app` contains code for linking Azure Document Intelligence with AI Search to process complex documents with charts and images, and uses multi-modal models (gpt4o) to interpret and understand these. +- `./deploy_ai_search` provides an easy Python based utility for deploying an index, indexer and corresponding skillset for AI Search. The above components have been successfully used on production RAG projects to increase the quality of responses. The code provided in this repo is a sample of the implementation and should be adjusted before being used in production. diff --git a/adi_function_app/README.md b/adi_function_app/README.md index 5c3dd3c..3638b88 100644 --- a/adi_function_app/README.md +++ b/adi_function_app/README.md @@ -38,7 +38,7 @@ The properties returned from the ADI Custom Skill are then used to perform the f ## Provided Notebooks \& Utilities -- `./adi_function_app` provides a pre-built Python function app that communicates with Azure Document Intelligence, Azure OpenAI etc to perform the Markdown conversion, extraction of figures, figure understanding and corresponding cleaning of Markdown. +- `./adi_function_app` provides a pre-built Python function app that communicates with Azure Document Intelligence, Azure OpenAI etc to perform the Markdown conversion, extraction of figures, figure understanding and corresponding cleaning of Markdown. - `./rag_with_ai_search.ipynb` provides example of how to utilise the AI Search plugin to query the index. ## Deploying AI Search Setup @@ -47,7 +47,7 @@ To deploy the pre-built index and associated indexer / skillset setup, see instr ## ADI Custom Skill -Deploy the associated function app and required resources.
You can then experiment with the custom skill by sending an HTTP request in the AI Search JSON format to the `/adi_2_ai_search` HTTP endpoint. +Deploy the associated function app and required resources. You can then experiment with the custom skill by sending an HTTP request in the AI Search JSON format to the `/adi_2_ai_search` HTTP endpoint. To use with an index, either use the utility to configure a indexer in the provided form, or integrate the skill with your skillset pipeline. @@ -61,7 +61,7 @@ To use with an index, either use the utility to configure a indexer in the provi #### function_app.py -`./indexer/adi_function_app.py` contains the HTTP entrypoints for the ADI skill and the other provided utility skills. +`./function_app.py` contains the HTTP entrypoints for the ADI skill and the other provided utility skills. #### adi_2_aisearch diff --git a/ai_search/README.md b/deploy_ai_search/README.md similarity index 100% rename from ai_search/README.md rename to deploy_ai_search/README.md diff --git a/ai_search/ai_search.py b/deploy_ai_search/ai_search.py similarity index 100% rename from ai_search/ai_search.py rename to deploy_ai_search/ai_search.py diff --git a/ai_search/deploy.py b/deploy_ai_search/deploy.py similarity index 100% rename from ai_search/deploy.py rename to deploy_ai_search/deploy.py diff --git a/ai_search/environment.py b/deploy_ai_search/environment.py similarity index 100% rename from ai_search/environment.py rename to deploy_ai_search/environment.py diff --git a/ai_search/rag_documents.py b/deploy_ai_search/rag_documents.py similarity index 100% rename from ai_search/rag_documents.py rename to deploy_ai_search/rag_documents.py diff --git a/ai_search/requirements.txt b/deploy_ai_search/requirements.txt similarity index 100% rename from ai_search/requirements.txt rename to deploy_ai_search/requirements.txt From a1e8899fe40b9e9d7a48471cbf23c2e00b9ac2b2 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Wed,
11 Sep 2024 01:08:07 +0100 Subject: [PATCH 33/33] Update deployment logic --- deploy_ai_search/ai_search.py | 28 +++++++++++++++++++++++++--- deploy_ai_search/rag_documents.py | 12 ++++++------ 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/deploy_ai_search/ai_search.py b/deploy_ai_search/ai_search.py index 222ee66..75a6bf0 100644 --- a/deploy_ai_search/ai_search.py +++ b/deploy_ai_search/ai_search.py @@ -128,21 +128,24 @@ def get_semantic_search(self) -> SemanticSearch: Returns: SemanticSearch: The semantic search configuration""" - @abstractmethod def get_skills(self) -> list: """Get the skillset for the indexer. Returns: list: The skillsets used in the indexer""" - @abstractmethod + return [] + def get_indexer(self) -> SearchIndexer: """Get the indexer for the indexer.""" - @abstractmethod + return None + def get_index_projections(self) -> SearchIndexerIndexProjections: """Get the index projections for the indexer.""" + return None + def get_synonym_map_names(self) -> list[str]: """Get the synonym map names for the indexer.""" return [] @@ -150,6 +153,9 @@ def get_synonym_map_names(self) -> list[str]: def get_data_source(self) -> SearchIndexerDataSourceConnection: """Get the data source for the indexer.""" + if self.get_indexer() is None: + return None + data_deletion_detection_policy = NativeBlobSoftDeleteDeletionDetectionPolicy() data_change_detection_policy = HighWaterMarkChangeDetectionPolicy( @@ -464,6 +470,12 @@ def deploy_index(self): def deploy_skillset(self): """This function deploys the skillset.""" skills = self.get_skills() + + if len(skills) == 0: + logging.warning("No skills defined. Skipping skillset deployment.") + + return + index_projections = self.get_index_projections() skillset = SearchIndexerSkillset( @@ -481,6 +493,11 @@ def deploy_data_source(self): """This function deploys the data source.""" data_source = self.get_data_source() + if data_source is None: + logging.warning("Data source not defined. 
Skipping data source deployment.") + + return + result = self._search_indexer_client.create_or_update_data_source_connection( data_source ) @@ -491,6 +508,11 @@ def deploy_indexer(self): """This function deploys the indexer.""" indexer = self.get_indexer() + if indexer is None: + logging.warning("Indexer not defined. Skipping indexer deployment.") + + return + result = self._search_indexer_client.create_or_update_indexer(indexer) logging.info("%s indexer created", result.name) diff --git a/deploy_ai_search/rag_documents.py b/deploy_ai_search/rag_documents.py index 2e1121a..5afa932 100644 --- a/deploy_ai_search/rag_documents.py +++ b/deploy_ai_search/rag_documents.py @@ -52,10 +52,10 @@ def __init__( self.enable_page_by_chunking = False def get_index_fields(self) -> list[SearchableField]: - """This function returns the index fields for inquiry document. + """This function returns the index fields for rag document. Returns: - list[SearchableField]: The index fields for inquiry document""" + list[SearchableField]: The index fields for rag document""" fields = [ SimpleField(name="Id", type=SearchFieldDataType.String, filterable=True), @@ -114,7 +114,7 @@ def get_index_fields(self) -> list[SearchableField]: return fields def get_semantic_search(self) -> SemanticSearch: - """This function returns the semantic search configuration for inquiry document + """This function returns the semantic search configuration for rag document Returns: SemanticSearch: The semantic search configuration""" @@ -178,7 +178,7 @@ def get_skills(self) -> list: return skills def get_index_projections(self) -> SearchIndexerIndexProjections: - """This function returns the index projections for inquiry document.""" + """This function returns the index projections for rag document.""" mappings = [ InputFieldMappingEntry(name="Chunk", source="/document/pages/*/chunk"), InputFieldMappingEntry( @@ -221,10 +221,10 @@ def get_index_projections(self) -> SearchIndexerIndexProjections: return
index_projections def get_indexer(self) -> SearchIndexer: - """This function returns the indexer for inquiry document. + """This function returns the indexer for rag document. Returns: - SearchIndexer: The indexer for inquiry document""" + SearchIndexer: The indexer for rag document""" # Only place on schedule if it is not a test deployment if self.test: