diff --git a/haystack/components/joiners/document_joiner.py b/haystack/components/joiners/document_joiner.py
index 0cf2535d5e..87e616bb6a 100644
--- a/haystack/components/joiners/document_joiner.py
+++ b/haystack/components/joiners/document_joiner.py
@@ -142,11 +142,20 @@ def _reciprocal_rank_fusion(self, document_lists):
 
         scores_map = defaultdict(int)
         documents_map = {}
-        for documents in document_lists:
+        # Fall back to uniform weights; the comprehension keeps an empty input from dividing by zero.
+        weights = self.weights if self.weights else [1 / len(document_lists) for _ in document_lists]
+
+        # Calculate weighted reciprocal rank fusion score
+        for documents, weight in zip(document_lists, weights):
             for rank, doc in enumerate(documents):
-                scores_map[doc.id] += 1 / (k + rank)
+                scores_map[doc.id] += (weight * len(document_lists)) / (k + rank)
                 documents_map[doc.id] = doc
 
+        # Normalize scores. With weights summing to 1, len(document_lists) / k is the maximum possible
+        # score, achieved by being ranked first in all doc lists with non-zero weight.
+        for doc_id in scores_map:
+            scores_map[doc_id] /= len(document_lists) / k
+
         for doc in documents_map.values():
             doc.score = scores_map[doc.id]
 
diff --git a/releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml b/releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml
new file mode 100644
index 0000000000..e732906c98
--- /dev/null
+++ b/releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml
@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Introduces weighted score normalization for the DocumentJoiner's reciprocal rank fusion, enhancing the relevance of document sorting by allowing customizable influence on the final scores.
diff --git a/test/components/joiners/test_document_joiner.py b/test/components/joiners/test_document_joiner.py
index af80ccc2f8..d0a0d4cd29 100644
--- a/test/components/joiners/test_document_joiner.py
+++ b/test/components/joiners/test_document_joiner.py
@@ -126,3 +126,43 @@ def test_output_documents_not_sorted_by_score(self):
         documents_2 = [Document(content="d", score=0.2)]
         output = joiner.run([documents_1, documents_2])
         assert output["documents"] == documents_1 + documents_2
+
+    def test_score_norm_with_rrf(self):
+        """
+        Verifies reciprocal rank fusion (RRF) of the DocumentJoiner component with various weight configurations.
+        It creates a set of documents, forms them into two lists, and then applies multiple DocumentJoiner
+        instances with distinct weights to these lists. The test checks if the resulting
+        joined documents are correctly sorted in descending order by score, ensuring the RRF ranking works as
+        expected under different weighting scenarios.
+        """
+        num_docs = 6
+        docs = []
+
+        for i in range(num_docs):
+            docs.append(Document(content=f"doc{i}"))
+
+        docs_2 = [docs[0], docs[4], docs[2], docs[5], docs[1]]
+        document_lists = [docs, docs_2]
+
+        joiner_1 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[0.5, 0.5])
+
+        joiner_2 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[7, 7])
+
+        joiner_3 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[0.7, 0.3])
+
+        joiner_4 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[0.6, 0.4])
+
+        joiner_5 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[1, 0])
+
+        joiners = [joiner_1, joiner_2, joiner_3, joiner_4, joiner_5]
+
+        for joiner in joiners:
+            join_results = joiner.run(documents=document_lists)
+            is_sorted = all(
+                join_results["documents"][i].score >= join_results["documents"][i + 1].score
+                for i in range(len(join_results["documents"]) - 1)
+            )
+
+            assert (
+                is_sorted
+            ), "Documents are not sorted in descending order by score, there is an issue with RRF ranking"