From 291ca596194c4735ef5177c6c8fac1a709cd8c89 Mon Sep 17 00:00:00 2001 From: bramhanandlingala Date: Fri, 12 Jun 2026 12:55:56 +0530 Subject: [PATCH] fix(providers/common-ai): LlamaIndexEmbeddingOperator always returns vector=None --- .../ai/operators/llamaindex_embedding.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/providers/common/ai/src/airflow/providers/common/ai/operators/llamaindex_embedding.py b/providers/common/ai/src/airflow/providers/common/ai/operators/llamaindex_embedding.py index d85e692100202..6fab06ef6b748 100644 --- a/providers/common/ai/src/airflow/providers/common/ai/operators/llamaindex_embedding.py +++ b/providers/common/ai/src/airflow/providers/common/ai/operators/llamaindex_embedding.py @@ -125,9 +125,19 @@ def execute(self, context: Context) -> dict[str, Any]: nodes = splitter.get_nodes_from_documents(llama_docs) self.log.info("Split %d documents into %d chunks", len(llama_docs), len(nodes)) - # ``VectorStoreIndex(...)`` populates each node's ``.embedding`` as a - # side effect of building the index; capture the index so the - # variable isn't discarded. + # Pre-embed nodes so that ``.embedding`` is set on the original node + # objects before they are passed to VectorStoreIndex. VectorStoreIndex + # calls ``_get_node_with_embedding()`` which does ``node.model_copy()`` + # and attaches the embedding to the *copy*, never the original. Reading + # ``node.embedding`` after index construction therefore always returns + # ``None`` (confirmed across llama-index-core v0.10–v0.14). + # ``embed_nodes()`` inside VectorStoreIndex skips nodes whose + # ``.embedding`` is already set, so pre-embedding causes no duplicate + # API calls. + texts = [node.get_content() for node in nodes] + vectors = embed_model.get_text_embedding_batch(texts, show_progress=False) + for node, vector in zip(nodes, vectors): + node.embedding = vector index = VectorStoreIndex(nodes, embed_model=embed_model, show_progress=False) if self.persist_dir: @@ -136,8 +146,8 @@ def execute(self, context: Context) -> dict[str, Any]: # ``SentenceSplitter`` always returns ``TextNode`` instances, but the # base ``get_nodes_from_documents`` signature is typed as # ``list[BaseNode]`` (which has no ``.text``). Cast so mypy doesn't - # flag the ``.text`` access; ``node.embedding`` is populated by - # ``VectorStoreIndex`` for every node above. + # flag the ``.text`` access; ``node.embedding`` is populated by the + # pre-embed step above for every node. text_nodes = cast("list[TextNode]", nodes) chunks = [ {