diff --git a/src/langchain_google_cloud_sql_pg/vectorstore.py b/src/langchain_google_cloud_sql_pg/vectorstore.py index 964e9df9..cbe12767 100644 --- a/src/langchain_google_cloud_sql_pg/vectorstore.py +++ b/src/langchain_google_cloud_sql_pg/vectorstore.py @@ -22,7 +22,7 @@ import numpy as np from langchain_core.documents import Document from langchain_core.embeddings import Embeddings -from langchain_core.vectorstores import VectorStore +from langchain_core.vectorstores import VectorStore, utils from sqlalchemy.engine.row import RowMapping from .engine import PostgresEngine @@ -788,7 +788,7 @@ async def amax_marginal_relevance_search_with_score_by_vector( fetch_k = fetch_k if fetch_k else self.fetch_k lambda_mult = lambda_mult if lambda_mult else self.lambda_mult embedding_list = [json.loads(row[self.embedding_column]) for row in results] - mmr_selected = maximal_marginal_relevance( + mmr_selected = utils.maximal_marginal_relevance( np.array(embedding, dtype=np.float32), embedding_list, k=k, @@ -963,76 +963,3 @@ async def is_valid_index( """ results = await self.engine._afetch(query) return bool(len(results) == 1) - - -### The following is copied from langchain-community until it's moved into core - -Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray] - - -def maximal_marginal_relevance( - query_embedding: np.ndarray, - embedding_list: list, - lambda_mult: float = 0.5, - k: int = 4, -) -> List[int]: - """Calculate maximal marginal relevance.""" - if min(k, len(embedding_list)) <= 0: - return [] - if query_embedding.ndim == 1: - query_embedding = np.expand_dims(query_embedding, axis=0) - similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0] - most_similar = int(np.argmax(similarity_to_query)) - idxs = [most_similar] - selected = np.array([embedding_list[most_similar]]) - while len(idxs) < min(k, len(embedding_list)): - best_score = -np.inf - idx_to_add = -1 - similarity_to_selected = cosine_similarity(embedding_list, selected) - for i, query_score in enumerate(similarity_to_query): - if i in idxs: - continue - redundant_score = max(similarity_to_selected[i]) - equation_score = ( - lambda_mult * query_score - (1 - lambda_mult) * redundant_score - ) - if equation_score > best_score: - best_score = equation_score - idx_to_add = i - idxs.append(idx_to_add) - selected = np.append(selected, [embedding_list[idx_to_add]], axis=0) - return idxs - - -def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: - """Row-wise cosine similarity between two equal-width matrices.""" - if len(X) == 0 or len(Y) == 0: - return np.array([]) - - X = np.array(X) - Y = np.array(Y) - if X.shape[1] != Y.shape[1]: - raise ValueError( - f"Number of columns in X and Y must be the same. X has shape {X.shape} " - f"and Y has shape {Y.shape}." - ) - try: - import simsimd as simd # type: ignore - - X = np.array(X, dtype=np.float32) - Y = np.array(Y, dtype=np.float32) - Z = 1 - simd.cdist(X, Y, metric="cosine") - if isinstance(Z, float): - return np.array([Z]) - return Z - except ImportError: - X_norm = np.linalg.norm(X, axis=1) - Y_norm = np.linalg.norm(Y, axis=1) - # Ignore divide by zero errors run time warnings as those are handled below. - with np.errstate(divide="ignore", invalid="ignore"): - similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm) - similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0 - return similarity - - -### End code from langchain-community