From 03981e9ba3c6dfbf35425b2377380fb44c21dd3a Mon Sep 17 00:00:00 2001 From: robpasternak Date: Fri, 12 Jan 2024 13:48:16 +0100 Subject: [PATCH 1/4] Add weighting and score normalization for DocumentJoiner w/ reciprocal rank fusion (fix trailing whitespace) --- haystack/components/joiners/document_joiner.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/haystack/components/joiners/document_joiner.py b/haystack/components/joiners/document_joiner.py index 0cf2535d5e..87e616bb6a 100644 --- a/haystack/components/joiners/document_joiner.py +++ b/haystack/components/joiners/document_joiner.py @@ -142,11 +142,19 @@ def _reciprocal_rank_fusion(self, document_lists): scores_map = defaultdict(int) documents_map = {} - for documents in document_lists: + weights = self.weights if self.weights else [1 / len(document_lists)] * len(document_lists) + + # Calculate weighted reciprocal rank fusion score + for documents, weight in zip(document_lists, weights): for rank, doc in enumerate(documents): - scores_map[doc.id] += 1 / (k + rank) + scores_map[doc.id] += (weight * len(document_lists)) / (k + rank) documents_map[doc.id] = doc + # Normalize scores. Note: len(document_lists) / k is the maximum possible score, + # achieved by being ranked first in all doc lists with non-zero weight. 
+ for id in scores_map: + scores_map[id] /= len(document_lists) / k + for doc in documents_map.values(): doc.score = scores_map[doc.id] From 055323401384ef2d99395acdee5bb1aa81951100 Mon Sep 17 00:00:00 2001 From: robpasternak Date: Fri, 12 Jan 2024 13:59:58 +0100 Subject: [PATCH 2/4] Add release notes --- .../weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml diff --git a/releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml b/releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml new file mode 100644 index 0000000000..712aaf168e --- /dev/null +++ b/releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml @@ -0,0 +1,5 @@ +--- +enhancements: + - | + Enables score weighting and normalization for + DocumentJoiner node with reciprocal rank fusion. From ead49e1c0f0e7467d1b43ea8e995e6920562f94a Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 24 Jan 2024 13:48:27 +0100 Subject: [PATCH 3/4] Add unit test --- .../joiners/test_document_joiner.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/test/components/joiners/test_document_joiner.py b/test/components/joiners/test_document_joiner.py index af80ccc2f8..d0a0d4cd29 100644 --- a/test/components/joiners/test_document_joiner.py +++ b/test/components/joiners/test_document_joiner.py @@ -126,3 +126,43 @@ def test_output_documents_not_sorted_by_score(self): documents_2 = [Document(content="d", score=0.2)] output = joiner.run([documents_1, documents_2]) assert output["documents"] == documents_1 + documents_2 + + def test_test_score_norm_with_rrf(self): + """ + Verifies reciprocal rank fusion (RRF) of the DocumentJoiner component with various weight configurations. 
+ It creates a set of documents, forms them into two lists, and then applies multiple DocumentJoiner + instances with distinct weights to these lists. The test checks if the resulting + joined documents are correctly sorted in descending order by score, ensuring the RRF ranking works as + expected under different weighting scenarios. + """ + num_docs = 6 + docs = [] + + for i in range(num_docs): + docs.append(Document(content=f"doc{i}")) + + docs_2 = [docs[0], docs[4], docs[2], docs[5], docs[1]] + document_lists = [docs, docs_2] + + joiner_1 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[0.5, 0.5]) + + joiner_2 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[7, 7]) + + joiner_3 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[0.7, 0.3]) + + joiner_4 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[0.6, 0.4]) + + joiner_5 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[1, 0]) + + joiners = [joiner_1, joiner_2, joiner_3, joiner_4, joiner_5] + + for index, joiner in enumerate(joiners): + join_results = joiner.run(documents=document_lists) + is_sorted = all( + join_results["documents"][i].score >= join_results["documents"][i + 1].score + for i in range(len(join_results["documents"]) - 1) + ) + + assert ( + is_sorted + ), "Documents are not sorted in descending order by score, there is an issue with rff ranking" From dfb973bb802bb968190714de20b1d1dceeba5adb Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 24 Jan 2024 13:57:03 +0100 Subject: [PATCH 4/4] Update release note --- .../weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml b/releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml index 712aaf168e..e732906c98 100644 --- a/releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml +++ 
b/releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml @@ -1,5 +1,4 @@ --- enhancements: - | - Enables score weighting and normalization for - DocumentJoiner node with reciprocal rank fusion. + Introduces weighted score normalization for the DocumentJoiner's reciprocal rank fusion, enhancing the relevance of document sorting by allowing customizable influence on the final scores.