From fdee07f1f65ef811b49e61fd7397f110349ce8ec Mon Sep 17 00:00:00 2001 From: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Date: Wed, 6 Dec 2023 18:17:20 +0100 Subject: [PATCH 1/6] Change type of PromptModel invocation_layer_class init param (#6497) --- test/prompt/test_prompt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/prompt/test_prompt_model.py b/test/prompt/test_prompt_model.py index 5406172f63..78ca51f547 100644 --- a/test/prompt/test_prompt_model.py +++ b/test/prompt/test_prompt_model.py @@ -40,7 +40,7 @@ def test_constructor_with_no_supported_model(): @pytest.mark.unit -def test_constructor_with_invocation_layer_class_string(mock_auto_tokenizer): +def test_constructor_with_invocation_layer_class_string(): model = PromptModel( invocation_layer_class="haystack.nodes.prompt.invocation_layer.CohereInvocationLayer", api_key="fake_api_key" ) From c514720ccc9b547275c27f11d9ce837005406ef3 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Tue, 12 Dec 2023 12:41:06 +0100 Subject: [PATCH 2/6] fix: mypy `"str" not callable` for `PromptModelInvocationLayer` (#6529) * cast to PromptModelInvocationLayer * fix pylint pointless-exception-statement * use two variables to avoid re-assignment * black * use mocked tokenizer in unit test --- test/prompt/test_prompt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/prompt/test_prompt_model.py b/test/prompt/test_prompt_model.py index 78ca51f547..5406172f63 100644 --- a/test/prompt/test_prompt_model.py +++ b/test/prompt/test_prompt_model.py @@ -40,7 +40,7 @@ def test_constructor_with_no_supported_model(): @pytest.mark.unit -def test_constructor_with_invocation_layer_class_string(): +def test_constructor_with_invocation_layer_class_string(mock_auto_tokenizer): model = PromptModel( invocation_layer_class="haystack.nodes.prompt.invocation_layer.CohereInvocationLayer", api_key="fake_api_key" ) From 74d8f9508dcb3c1618fb270d64dd2e78a4fd3577 Mon Sep 17 00:00:00 2001 From: robpasternak Date: Fri, 11 Aug 2023 12:01:52 +0200 Subject: [PATCH 3/6] Add normalization and weighting for `JoinDocuments` reciprocal rank fusion --- haystack/nodes/other/join_docs.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/haystack/nodes/other/join_docs.py b/haystack/nodes/other/join_docs.py index 274e90a38d..ae4dd9db8d 100644 --- a/haystack/nodes/other/join_docs.py +++ b/haystack/nodes/other/join_docs.py @@ -155,8 +155,13 @@ def _calculate_rrf(self, results): K = 61 scores_map = defaultdict(int) - for result in results: + weights = self.weights if self.weights else [1 / len(results)] * len(results) + for result, weight in zip(results, weights): for rank, doc in enumerate(result): - scores_map[doc.id] += 1 / (K + rank) + scores_map[doc.id] += (weight * len(results)) / (K + rank) + + # Normalize scores + for id in scores_map: + scores_map[id] = scores_map[id] / (len(results) / K) return scores_map From 78a6f6b163478d4ab66f341cdee6711040c548c5 Mon Sep 17 00:00:00 2001 From: robpasternak Date: Fri, 1 Sep 2023 16:52:16 +0200 Subject: [PATCH 4/6] Add weights and score normalization for reciprocal rank fusion in JoinDocuments node. --- haystack/nodes/other/join_docs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haystack/nodes/other/join_docs.py b/haystack/nodes/other/join_docs.py index ae4dd9db8d..b94d7d14e5 100644 --- a/haystack/nodes/other/join_docs.py +++ b/haystack/nodes/other/join_docs.py @@ -156,11 +156,14 @@ def _calculate_rrf(self, results): scores_map = defaultdict(int) weights = self.weights if self.weights else [1 / len(results)] * len(results) + + # Calculate weighted reciprocal rank fusion score for result, weight in zip(results, weights): for rank, doc in enumerate(result): scores_map[doc.id] += (weight * len(results)) / (K + rank) - # Normalize scores + # Normalize scores. Note: len(results) / K is the maximum possible score, + # achieved by being ranked first in all results with non-zero weight. for id in scores_map: scores_map[id] = scores_map[id] / (len(results) / K) From fe07486e61c51aaca8aad3227fbe3e2994225180 Mon Sep 17 00:00:00 2001 From: robpasternak Date: Fri, 15 Dec 2023 16:39:57 +0100 Subject: [PATCH 5/6] Fix black-jupyter --- ...n-docs-weighting-rrf-c52ba00a25004fd4.yaml | 6 ++++ test/nodes/test_join_documents.py | 32 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 releasenotes/notes/join-docs-weighting-rrf-c52ba00a25004fd4.yaml diff --git a/releasenotes/notes/join-docs-weighting-rrf-c52ba00a25004fd4.yaml b/releasenotes/notes/join-docs-weighting-rrf-c52ba00a25004fd4.yaml new file mode 100644 index 0000000000..23a31d911e --- /dev/null +++ b/releasenotes/notes/join-docs-weighting-rrf-c52ba00a25004fd4.yaml @@ -0,0 +1,6 @@ +--- +enhancements: + - | + Make `JoinDocuments` sensitive to `weights` parameter when + `join_mode` is reciprocal rank fusion. Add score normalization + for `JoinDocuments` when `join_mode` is reciprocal rank fusion. diff --git a/test/nodes/test_join_documents.py b/test/nodes/test_join_documents.py index 463aeaa577..246c107894 100644 --- a/test/nodes/test_join_documents.py +++ b/test/nodes/test_join_documents.py @@ -113,3 +113,35 @@ def test_joindocuments_concatenate_duplicate_docs_null_score(): result, _ = join_docs.run(inputs) assert len(result["documents"]) == 3 assert result["documents"] == expected_outputs["documents"] + + +@pytest.mark.unit +def test_joindocuments_rrf_weights(): + """ + Test that the reciprocal rank fusion method correctly handles weights. + """ + inputs = [ + { + "documents": [ + Document(content="text document 1", content_type="text", score=0.2), + Document(content="text document 2", content_type="text", score=0.3), + ] + }, + { + "documents": [ + Document(content="text document 3", content_type="text", score=0.7), + Document(content="text document 4", content_type="text", score=None), + ] + }, + ] + + join_docs_none = JoinDocuments(join_mode="reciprocal_rank_fusion") + result_none, _ = join_docs_none.run(inputs) + join_docs_even = JoinDocuments(join_mode="reciprocal_rank_fusion", weights=[0.5, 0.5]) + result_even, _ = join_docs_even.run(inputs) + join_docs_uneven = JoinDocuments(join_mode="reciprocal_rank_fusion", weights=[0.7, 0.3]) + result_uneven, _ = join_docs_uneven.run(inputs) + + assert result_none["documents"] == result_even["documents"] + assert result_uneven["documents"] != result_none["documents"] + assert result_uneven["documents"][0].score > result_none["documents"][0].score From 6d64d2c86c623ccd02270e23733c9c8475c80c9a Mon Sep 17 00:00:00 2001 From: robpasternak Date: Wed, 20 Dec 2023 17:29:21 +0100 Subject: [PATCH 6/6] Fix JoinDocuments test for rrf + score normalization --- test/nodes/test_join_documents.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/nodes/test_join_documents.py b/test/nodes/test_join_documents.py index 246c107894..ae809f4994 100644 --- a/test/nodes/test_join_documents.py +++ b/test/nodes/test_join_documents.py @@ -3,6 +3,7 @@ from haystack import Document from haystack.nodes.other.join_docs import JoinDocuments +from copy import deepcopy @pytest.mark.unit @@ -120,7 +121,7 @@ def test_joindocuments_rrf_weights(): """ Test that the reciprocal rank fusion method correctly handles weights. """ - inputs = [ + inputs_none = [ { "documents": [ Document(content="text document 1", content_type="text", score=0.2), @@ -135,12 +136,15 @@ def test_joindocuments_rrf_weights(): }, ] + inputs_even = deepcopy(inputs_none) + inputs_uneven = deepcopy(inputs_none) + join_docs_none = JoinDocuments(join_mode="reciprocal_rank_fusion") - result_none, _ = join_docs_none.run(inputs) + result_none, _ = join_docs_none.run(inputs_none) join_docs_even = JoinDocuments(join_mode="reciprocal_rank_fusion", weights=[0.5, 0.5]) - result_even, _ = join_docs_even.run(inputs) + result_even, _ = join_docs_even.run(inputs_even) join_docs_uneven = JoinDocuments(join_mode="reciprocal_rank_fusion", weights=[0.7, 0.3]) - result_uneven, _ = join_docs_uneven.run(inputs) + result_uneven, _ = join_docs_uneven.run(inputs_uneven) assert result_none["documents"] == result_even["documents"] assert result_uneven["documents"] != result_none["documents"]