Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions haystack/components/joiners/document_joiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,11 +142,19 @@ def _reciprocal_rank_fusion(self, document_lists):

scores_map = defaultdict(int)
documents_map = {}
for documents in document_lists:
weights = self.weights if self.weights else [1 / len(document_lists)] * len(document_lists)

# Calculate weighted reciprocal rank fusion score
for documents, weight in zip(document_lists, weights):
for rank, doc in enumerate(documents):
scores_map[doc.id] += 1 / (k + rank)
scores_map[doc.id] += (weight * len(document_lists)) / (k + rank)
documents_map[doc.id] = doc

# Normalize scores. Note: len(document_lists) / k is the maximum possible score,
# achieved by being ranked first in all doc lists with non-zero weight.
for id in scores_map:
scores_map[id] /= len(document_lists) / k

for doc in documents_map.values():
doc.score = scores_map[doc.id]

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
Introduces weighted score normalization for the DocumentJoiner's reciprocal rank fusion, improving the relevance of document sorting by letting each input document list have a customizable influence on the final scores.
40 changes: 40 additions & 0 deletions test/components/joiners/test_document_joiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,43 @@ def test_output_documents_not_sorted_by_score(self):
documents_2 = [Document(content="d", score=0.2)]
output = joiner.run([documents_1, documents_2])
assert output["documents"] == documents_1 + documents_2

def test_test_score_norm_with_rrf(self):
    """
    Check that weighted reciprocal rank fusion (RRF) yields documents sorted by score.

    Two overlapping document lists are joined by several DocumentJoiner instances, each
    configured with a different pair of weights. For every weight configuration, the
    joined documents must be returned in descending order of score.
    """
    num_docs = 6
    docs = [Document(content=f"doc{i}") for i in range(num_docs)]

    # The second list is a reordered subset of the first (docs[3] is absent from it),
    # so the two input rankings disagree with each other.
    docs_2 = [docs[0], docs[4], docs[2], docs[5], docs[1]]
    document_lists = [docs, docs_2]

    # Equal weights, weights that do not sum to 1, skewed weights, and a zero weight.
    weight_configs = [[0.5, 0.5], [7, 7], [0.7, 0.3], [0.6, 0.4], [1, 0]]
    joiners = [
        DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=weights)
        for weights in weight_configs
    ]

    for joiner in joiners:
        joined = joiner.run(documents=document_lists)["documents"]
        # Compare each document with its successor; every score must be >= the next one.
        is_sorted = all(
            current.score >= following.score
            for current, following in zip(joined, joined[1:])
        )

        assert (
            is_sorted
        ), "Documents are not sorted in descending order by score, there is an issue with RRF ranking"